From c601518ca0649a091b2eac117361680545f18b5b Mon Sep 17 00:00:00 2001 From: Xueshen Liu Date: Sun, 25 Jun 2023 12:15:49 -0400 Subject: [PATCH 1/2] add compile time launch bound --- gpu/.depend | 304 +++++++++---------------------------------------- gpu/gpu.mk | 2 + gpu/plscore.cu | 45 +++++++- 3 files changed, 93 insertions(+), 258 deletions(-) diff --git a/gpu/.depend b/gpu/.depend index f5150023..f44c25e2 100644 --- a/gpu/.depend +++ b/gpu/.depend @@ -1,252 +1,52 @@ -plchain.o : gpu/plchain.cu \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_runtime.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/host_config.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/builtin_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/host_defines.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/driver_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/surface_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/texture_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/library_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/channel_descriptor.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_runtime_api.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/driver_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/common_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/math_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/math_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_double_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_double_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_35_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_30_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_30_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_35_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_61_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_61_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_70_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_70_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_80_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_80_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_90_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_90_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/texture_indirect_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/surface_indirect_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/cudacc_ext.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_launch_parameters.h \ - ./mmpriv.h \ - ./minimap.h \ - ./bseq.h \ - ./kseq.h \ - gpu/plmem.cuh \ - gpu/hipify.cuh \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda.h \ - gpu/plchain.h \ - gpu/plutils.h \ - ./kalloc.h \ - gpu/plrange.cuh \ - gpu/plscore.cuh -plrange.o : gpu/plrange.cu \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_runtime.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/host_config.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/builtin_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/host_defines.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/driver_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/surface_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/texture_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/library_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/channel_descriptor.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_runtime_api.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/driver_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/common_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/math_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/math_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_double_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_double_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_35_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_30_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_30_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_35_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_61_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_61_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_70_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_70_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_80_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_80_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_90_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_90_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/texture_indirect_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/surface_indirect_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/cudacc_ext.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_launch_parameters.h \ - gpu/plrange.cuh \ - gpu/plmem.cuh \ - gpu/hipify.cuh \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda.h \ - gpu/plchain.h \ - gpu/plutils.h \ - ./kalloc.h \ - ./minimap.h -plmem.o : gpu/plmem.cu \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_runtime.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/host_config.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/builtin_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/host_defines.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/driver_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/surface_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/texture_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/library_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/channel_descriptor.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_runtime_api.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/driver_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/common_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/math_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/math_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_double_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_double_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_35_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_30_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_30_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_35_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_61_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_61_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_70_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_70_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_80_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_80_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_90_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_90_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/texture_indirect_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/surface_indirect_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/cudacc_ext.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_launch_parameters.h \ - gpu/plmem.cuh \ - gpu/hipify.cuh \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda.h \ - gpu/plchain.h \ - gpu/plutils.h \ - ./kalloc.h \ - ./minimap.h \ - gpu/plrange.cuh \ - gpu/plscore.cuh \ - ./mmpriv.h \ - ./bseq.h \ - ./kseq.h \ - cJSON/cJSON.h -plscore.o : gpu/plscore.cu \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_runtime.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/host_config.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/builtin_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/host_defines.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/driver_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/surface_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/texture_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/library_types.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/channel_descriptor.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_runtime_api.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/driver_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/vector_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/common_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/math_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/math_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_double_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/device_double_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_35_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_20_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_30_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_30_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_32_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_35_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_61_intrinsics.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/sm_61_intrinsics.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_70_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_70_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_80_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_80_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_90_rt.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/sm_90_rt.hpp \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/texture_indirect_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/surface_indirect_functions.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/crt/cudacc_ext.h \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/device_launch_parameters.h \ - gpu/plscore.cuh \ - gpu/plmem.cuh \ - gpu/hipify.cuh \ - /usr/local/cuda-12/bin/../targets/x86_64-linux/include/cuda.h \ - gpu/plchain.h \ - gpu/plutils.h \ - ./kalloc.h \ - ./minimap.h \ - ./mmpriv.h \ - ./bseq.h \ - ./kseq.h + +# __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa-gfx1030 +plchain.o: gpu/plchain.cu mmpriv.h minimap.h bseq.h kseq.h gpu/plmem.cuh \ + gpu/hipify.cuh gpu/plchain.h gpu/plutils.h kalloc.h gpu/plrange.cuh \ + gpu/plscore.cuh + +# __CLANG_OFFLOAD_BUNDLE____END__ hip-amdgcn-amd-amdhsa-gfx1030 + +# __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +plchain.o: gpu/plchain.cu mmpriv.h minimap.h bseq.h kseq.h gpu/plmem.cuh \ + gpu/hipify.cuh gpu/plchain.h gpu/plutils.h kalloc.h gpu/plrange.cuh \ + gpu/plscore.cuh + +# __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu + +# __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa-gfx1030 +plmem.o: gpu/plmem.cu gpu/plmem.cuh gpu/hipify.cuh gpu/plchain.h \ + gpu/plutils.h kalloc.h minimap.h gpu/plrange.cuh gpu/plscore.cuh \ + mmpriv.h bseq.h kseq.h cJSON/cJSON.h + +# __CLANG_OFFLOAD_BUNDLE____END__ hip-amdgcn-amd-amdhsa-gfx1030 + +# __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +plmem.o: gpu/plmem.cu gpu/plmem.cuh gpu/hipify.cuh gpu/plchain.h \ + gpu/plutils.h kalloc.h minimap.h gpu/plrange.cuh gpu/plscore.cuh \ + mmpriv.h bseq.h kseq.h cJSON/cJSON.h + +# __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu + +# __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa-gfx1030 +plrange.o: gpu/plrange.cu gpu/plrange.cuh gpu/plmem.cuh gpu/hipify.cuh \ + gpu/plchain.h gpu/plutils.h kalloc.h minimap.h + +# __CLANG_OFFLOAD_BUNDLE____END__ hip-amdgcn-amd-amdhsa-gfx1030 + +# __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +plrange.o: gpu/plrange.cu gpu/plrange.cuh gpu/plmem.cuh gpu/hipify.cuh \ + gpu/plchain.h gpu/plutils.h kalloc.h minimap.h + +# __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu + +# __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa-gfx1030 +plscore.o: gpu/plscore.cu gpu/plscore.cuh gpu/plmem.cuh gpu/hipify.cuh \ + gpu/plchain.h gpu/plutils.h kalloc.h minimap.h mmpriv.h bseq.h kseq.h + +# __CLANG_OFFLOAD_BUNDLE____END__ hip-amdgcn-amd-amdhsa-gfx1030 + +# __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +plscore.o: gpu/plscore.cu gpu/plscore.cuh gpu/plmem.cuh gpu/hipify.cuh \ + gpu/plchain.h gpu/plutils.h kalloc.h minimap.h mmpriv.h bseq.h kseq.h + +# __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu diff --git a/gpu/gpu.mk b/gpu/gpu.mk index 1668f98f..df73ba03 100644 --- a/gpu/gpu.mk +++ b/gpu/gpu.mk @@ -1,5 +1,7 @@ GPU ?= AMD CONFIG = $(if $(GPU_CONFIG),-DGPU_CONFIG='"$(GPU_CONFIG)"') +CONFIG += $(if $(LONG_BLOCK_SIZE),-D__LONG_BLOCK_SIZE__=\($(LONG_BLOCK_SIZE)\)) +CONFIG += $(if $(SHORT_BLOCK_SIZE),-D__SHORT_BLOCK_SIZE__=\($(SHORT_BLOCK_SIZE)\)) ################################################### ############ CPU Compile ################### diff --git a/gpu/plscore.cu b/gpu/plscore.cu index 09e73db1..b3672bec 100644 --- a/gpu/plscore.cu +++ b/gpu/plscore.cu @@ -159,7 +159,9 @@ inline __device__ void compute_sc_long_seg_one_wf(const int64_t* anchors_x, cons /* kernels begin */ - +#ifdef __SHORT_BLOCK_SIZE__ +__launch_bounds__(__SHORT_BLOCK_SIZE__) +#endif __global__ void score_generation_short( /* Input: Anchor & Range Inputs */ const int64_t* anchors_x, const int64_t* anchors_y, int32_t *range, @@ -203,7 +205,9 @@ __global__ void score_generation_short( } } - +#ifdef __LONG_BLOCK_SIZE__ +__launch_bounds__(__LONG_BLOCK_SIZE__) +#endif __global__ void score_generation_long(const int64_t* anchors_x, const int64_t* anchors_y, int32_t *range, seg_t *long_seg, unsigned int* long_seg_count, int32_t* f, uint16_t* p){ @@ -266,7 +270,6 @@ void plscore_upload_misc(Misc input_misc) { void plscore_async_long_short_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* stream) { size_t total_n = dev_mem->total_n; size_t cut_num = dev_mem->num_cut; - dim3 shortDimBlock(score_kernel_config.short_blockdim, 1, 1); dim3 shortDimGrid(score_kernel_config.short_griddim, 1, 1); dim3 longDimGrid(score_kernel_config.long_griddim, 1, 1); @@ -274,16 +277,33 @@ void plscore_async_long_short_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* st // printf("Grid Dim, %d\n", DimGrid.x); cudaMemsetAsync(dev_mem->d_long_seg_count, 0, sizeof(unsigned int), *stream); + + #ifdef __SHORT_BLOCK_SIZE__ + printf("short block size: %d\n", __SHORT_BLOCK_SIZE__); + score_generation_short<<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, + dev_mem->d_cut, dev_mem->d_f, dev_mem->d_p, total_n, cut_num, + dev_mem->d_long_seg, dev_mem->d_long_seg_count); + #else + dim3 shortDimBlock(score_kernel_config.short_blockdim, 1, 1); score_generation_short<<>>( dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, dev_mem->d_cut, dev_mem->d_f, dev_mem->d_p, total_n, cut_num, dev_mem->d_long_seg, dev_mem->d_long_seg_count); + #endif cudaCheck(); + #ifdef __LONG_BLOCK_SIZE__ + printf("long block size: %d\n", __LONG_BLOCK_SIZE__); + score_generation_long<<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, dev_mem->d_long_seg, + dev_mem->d_long_seg_count, dev_mem->d_f, dev_mem->d_p); + #else dim3 longDimBlock(score_kernel_config.long_blockdim, 1, 1); score_generation_long<<>>( dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, dev_mem->d_long_seg, dev_mem->d_long_seg_count, dev_mem->d_f, dev_mem->d_p); + #endif cudaCheck(); #ifdef DEBUG_VERBOSE fprintf(stderr, "[M::%s] score generation success\n", __func__); @@ -325,14 +345,22 @@ void plscore_sync_long_short_forward_dp(deviceMemPtr* dev_mem, Misc misc_) { size_t total_n = dev_mem->total_n; size_t cut_num = dev_mem->num_cut; plscore_upload_misc(misc_); - dim3 shortDimBlock(score_kernel_config.short_blockdim, 1, 1); dim3 longDimGrid(score_kernel_config.long_griddim, 1, 1); dim3 shortDimGrid(score_kernel_config.short_griddim, 1, 1); cudaMemset(dev_mem->d_long_seg_count, 0, sizeof(unsigned int)); + #ifdef __SHORT_BLOCK_SIZE__ + printf("short block size: %d\n", __SHORT_BLOCK_SIZE__); + score_generation_short<<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, + dev_mem->d_cut, dev_mem->d_f, dev_mem->d_p, total_n, cut_num, + dev_mem->d_long_seg, dev_mem->d_long_seg_count); + #else + dim3 shortDimBlock(score_kernel_config.short_blockdim, 1, 1); score_generation_short<<>>( dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, dev_mem->d_cut, dev_mem->d_f, dev_mem->d_p, total_n, cut_num, dev_mem->d_long_seg, dev_mem->d_long_seg_count); + #endif cudaCheck(); cudaDeviceSynchronize(); @@ -353,12 +381,17 @@ void plscore_sync_long_short_forward_dp(deviceMemPtr* dev_mem, Misc misc_) { // cudaMemcpy(elapsed_clk, d_clk, sizeof(long long int)*DimGrid.x, cudaMemcpyDeviceToHost); #endif // DEBUG_CHECK + #ifdef __LONG_BLOCK_SIZE__ + printf("long block size: %d\n", __LONG_BLOCK_SIZE__); + score_generation_long<<>>( + dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, dev_mem->d_long_seg, dev_mem->d_long_seg_count, + dev_mem->d_f, dev_mem->d_p); + #else dim3 longDimBlock(score_kernel_config.long_blockdim, 1, 1); - score_generation_long<<>>( dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, dev_mem->d_long_seg, dev_mem->d_long_seg_count, dev_mem->d_f, dev_mem->d_p); - + #endif cudaCheck(); cudaDeviceSynchronize(); cudaCheck(); From e6f016c619f95b6c91ffeb676004996448481b95 Mon Sep 17 00:00:00 2001 From: Xueshen Liu Date: Sun, 25 Jun 2023 12:21:50 -0400 Subject: [PATCH 2/2] add compile time launch bound using template --- gpu/plscore.cu | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/gpu/plscore.cu b/gpu/plscore.cu index b3672bec..2af16cba 100644 --- a/gpu/plscore.cu +++ b/gpu/plscore.cu @@ -160,7 +160,8 @@ inline __device__ void compute_sc_long_seg_one_wf(const int64_t* anchors_x, cons /* kernels begin */ #ifdef __SHORT_BLOCK_SIZE__ -__launch_bounds__(__SHORT_BLOCK_SIZE__) +template +__launch_bounds__(short_block_size) #endif __global__ void score_generation_short( /* Input: Anchor & Range Inputs */ @@ -206,7 +207,8 @@ __global__ void score_generation_short( } #ifdef __LONG_BLOCK_SIZE__ -__launch_bounds__(__LONG_BLOCK_SIZE__) +template +__launch_bounds__(long_block_size) #endif __global__ void score_generation_long(const int64_t* anchors_x, const int64_t* anchors_y, int32_t *range, seg_t *long_seg, unsigned int* long_seg_count, @@ -279,8 +281,8 @@ void plscore_async_long_short_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* st *stream); #ifdef __SHORT_BLOCK_SIZE__ - printf("short block size: %d\n", __SHORT_BLOCK_SIZE__); - score_generation_short<<>>( + fprintf(stderr, "short block size: %d\n", __SHORT_BLOCK_SIZE__); + score_generation_short<__SHORT_BLOCK_SIZE__><<>>( dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, dev_mem->d_cut, dev_mem->d_f, dev_mem->d_p, total_n, cut_num, dev_mem->d_long_seg, dev_mem->d_long_seg_count); @@ -294,8 +296,8 @@ void plscore_async_long_short_forward_dp(deviceMemPtr* dev_mem, cudaStream_t* st cudaCheck(); #ifdef __LONG_BLOCK_SIZE__ - printf("long block size: %d\n", __LONG_BLOCK_SIZE__); - score_generation_long<<>>( + fprintf(stderr, "long block size: %d\n", __LONG_BLOCK_SIZE__); + score_generation_long<__LONG_BLOCK_SIZE__><<>>( dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, dev_mem->d_long_seg, dev_mem->d_long_seg_count, dev_mem->d_f, dev_mem->d_p); #else @@ -350,7 +352,7 @@ void plscore_sync_long_short_forward_dp(deviceMemPtr* dev_mem, Misc misc_) { cudaMemset(dev_mem->d_long_seg_count, 0, sizeof(unsigned int)); #ifdef __SHORT_BLOCK_SIZE__ printf("short block size: %d\n", __SHORT_BLOCK_SIZE__); - score_generation_short<<>>( + score_generation_short<__SHORT_BLOCK_SIZE__><<>>( dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, dev_mem->d_cut, dev_mem->d_f, dev_mem->d_p, total_n, cut_num, dev_mem->d_long_seg, dev_mem->d_long_seg_count); @@ -383,7 +385,7 @@ void plscore_sync_long_short_forward_dp(deviceMemPtr* dev_mem, Misc misc_) { #ifdef __LONG_BLOCK_SIZE__ printf("long block size: %d\n", __LONG_BLOCK_SIZE__); - score_generation_long<<>>( + score_generation_long<__LONG_BLOCK_SIZE__><<>>( dev_mem->d_ax, dev_mem->d_ay, dev_mem->d_range, dev_mem->d_long_seg, dev_mem->d_long_seg_count, dev_mem->d_f, dev_mem->d_p); #else