-
Notifications
You must be signed in to change notification settings - Fork 176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Bug]: EFA is incompatible with Mellanox rdma #308
Labels
bug
Report errors or unexpected behavior
Linux EFA driver
triage
Determine the priority and severity
Comments
vmarkovtsev
added
bug
Report errors or unexpected behavior
triage
Determine the priority and severity
labels
Jul 5, 2024
I made it work with the following changes to the EFA DKMS package. Please let me know how I can contribute these changes.
diff -ruN efa-2.6.0/config/efa.cmake efa-2.6.0/config/efa.cmake
--- efa-2.6.0/config/efa.cmake 2024-07-05 21:41:00.045808694 +0200
+++ efa-2.6.0/config/efa.cmake 2024-07-05 21:31:20.625037540 +0200
@@ -11,13 +11,15 @@
set(tmp_dir ${tmp_dir} PARENT_SCOPE)
configure_file(${CMAKE_SOURCE_DIR}/config/main.c.in ${tmp_dir}/main.c @ONLY)
configure_file(${CMAKE_SOURCE_DIR}/config/Makefile ${tmp_dir} COPYONLY)
+ configure_file(${CMAKE_SOURCE_DIR}/config/kbuild.mk ${tmp_dir} COPYONLY)
endfunction()
function(try_compile_prog_test)
set_conf_tmp_dir("" "")
execute_process(COMMAND make -C ${tmp_dir} KERNEL_DIR=${KERNEL_DIR}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
- OUTPUT_QUIET ERROR_QUIET
+ #OUTPUT_QUIET
+ #ERROR_QUIET
RESULT_VARIABLE res)
if(res)
message(FATAL_ERROR "Conftest failure, kernel headers missing?")
diff -ruN efa-2.6.0/config/kbuild.mk efa-2.6.0/config/kbuild.mk
--- efa-2.6.0/config/kbuild.mk 1970-01-01 01:00:00.000000000 +0100
+++ efa-2.6.0/config/kbuild.mk 2024-07-05 21:31:20.005066674 +0200
@@ -0,0 +1,27 @@
+KVER ?= $(shell uname -r)
+MLNX_ARCH = $(shell uname -m)
+OFA_DIR ?= /usr/src/ofa_kernel/$(MLNX_ARCH)
+OFA := $(shell ( test -d $(OFA_DIR)/$(KVER) && echo $(OFA_DIR)/$(KVER) ) || ( test -d /var/lib/dkms/mlnx-ofed-kernel/ && ls -d /var/lib/dkms/mlnx-ofed-kernel/*/build ) || ( echo $(OFA_DIR) ))
+K_BUILD ?= /lib/modules/$(KVER)/build
+KERNEL_DIR ?= /lib/modules/$(KVER)/build
+KBUILD_EXTRA_SYMBOLS := $(OFA)/Module.symvers
+autoconf_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/autoconf.h 2> /dev/null | head -1)
+kconfig_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/kconfig.h 2> /dev/null | head -1)
+
+LINUXINCLUDE := \
+-include $(kconfig_h) \
+-include $(OFA)/include/linux/compat-2.6.h \
+-I$(PWD) \
+-I$(OFA)/include \
+-I$(OFA)/include/uapi \
+-I$$(srctree)/arch/$$(SRCARCH)/include \
+-Iarch/$$(SRCARCH)/include/generated \
+-Iinclude \
+-I$$(srctree)/arch/$$(SRCARCH)/include/uapi \
+-Iarch/$$(SRCARCH)/include/generated/uapi \
+-I$$(srctree)/include \
+-I$$(srctree)/include/uapi \
+-Iinclude/generated/uapi \
+$$(if $$(KBUILD_SRC),-Iinclude2 -I$$(srctree)/include) \
+-I$$(srctree)/arch/$$(SRCARCH)/include \
+-Iarch/$$(SRCARCH)/include/generated
diff -ruN efa-2.6.0/config/Makefile efa-2.6.0/config/Makefile
--- efa-2.6.0/config/Makefile 2024-07-05 21:40:59.769821663 +0200
+++ efa-2.6.0/config/Makefile 2024-07-05 21:31:20.321051825 +0200
@@ -7,7 +7,8 @@
obj-m += $(DRIVER_NAME).o
$(DRIVER_NAME)-objs := main.o
-KERNEL_DIR ?= /lib/modules/$(shell uname -r)/build
+obj ?= .
+include $(obj)/kbuild.mk
modules:
- $(MAKE) -C $(KERNEL_DIR) M=$(CURDIR) modules
+ $(MAKE) -C $(KERNEL_DIR) M=$(CURDIR) V=1 LINUXINCLUDE='$(LINUXINCLUDE)' modules
diff -ruN efa-2.6.0/dkms.conf efa-2.6.0/dkms.conf
--- efa-2.6.0/dkms.conf 2024-07-05 21:41:10.201331451 +0200
+++ efa-2.6.0/dkms.conf 2024-07-05 21:31:31.188541134 +0200
@@ -3,7 +3,7 @@
CLEAN="cd build; make modules_clean; make clean"
PRE_BUILD="./configure-dkms.sh $kernelver $source_tree"
# Quoted 'make' to suppress DKMS append of KERNELRELEASE
-MAKE="cd build; 'make'"
+MAKE="cd build; make VERBOSE=1"
BUILT_MODULE_NAME[0]="efa"
BUILT_MODULE_LOCATION="build/src/"
DEST_MODULE_LOCATION="/extra"
diff -ruN efa-2.6.0/src/build.mk efa-2.6.0/src/build.mk
--- efa-2.6.0/src/build.mk 1970-01-01 01:00:00.000000000 +0100
+++ efa-2.6.0/src/build.mk 2024-07-05 21:31:23.816887549 +0200
@@ -0,0 +1,4 @@
+include $(M)/kbuild.mk
+
+modules:
+ $(MAKE) -C $(KERNEL_DIR) M=$(M) V=1 LINUXINCLUDE='$(LINUXINCLUDE)' modules
diff -ruN efa-2.6.0/src/CMakeLists.txt efa-2.6.0/src/CMakeLists.txt
--- efa-2.6.0/src/CMakeLists.txt 2024-07-05 21:41:04.721588963 +0200
+++ efa-2.6.0/src/CMakeLists.txt 2024-07-05 21:31:25.680799959 +0200
@@ -13,6 +13,8 @@
string(REPLACE ";" " " efa_sources_string "${efa_sources}")
configure_file(Kbuild.in Kbuild @ONLY)
+configure_file(build.mk build.mk COPYONLY)
+configure_file(../config/kbuild.mk kbuild.mk COPYONLY)
foreach(src ${efa_sources})
configure_file(${src} ${src} COPYONLY)
@@ -25,14 +27,14 @@
message("-- Peer-to-peer memory enabled")
endif()
-set(module_cmd make -C ${KERNEL_DIR} M=${CMAKE_CURRENT_BINARY_DIR})
+set(module_cmd make -C ${CMAKE_CURRENT_BINARY_DIR} -f build.mk M=${CMAKE_CURRENT_BINARY_DIR})
if(GCOV_PROFILE)
set(module_cmd ${module_cmd} GCOV_PROFILE=y)
endif()
add_custom_command(OUTPUT efa.ko
COMMAND ${module_cmd} modules
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
- DEPENDS ${efa_sources} ${CMAKE_CURRENT_BINARY_DIR}/Kbuild ${CMAKE_CURRENT_BINARY_DIR}/config.h
+ DEPENDS ${efa_sources} ${CMAKE_CURRENT_BINARY_DIR}/Kbuild build.mk ${CMAKE_CURRENT_BINARY_DIR}/config.h ../config/kbuild.mk
VERBATIM)
add_custom_target(modules ALL DEPENDS efa.ko)
diff -ruN efa-2.6.0/src/Kbuild.in efa-2.6.0/src/Kbuild.in
--- efa-2.6.0/src/Kbuild.in 2024-07-05 21:41:01.813725613 +0200
+++ efa-2.6.0/src/Kbuild.in 2024-07-05 21:31:22.364955777 +0200
@@ -1,2 +1,4 @@
+include $(obj)/kbuild.mk
+KBUILD_EXTRA_SYMBOLS := $(OFA)/Module.symvers
obj-m := efa.o
efa-y := $(patsubst %.c,%.o, $(filter %.c, @efa_sources_string@)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Labels
bug
Report errors or unexpected behavior
Linux EFA driver
triage
Determine the priority and severity
Preliminary Actions
Driver Type
Linux kernel driver for Elastic Fabric Adapter (EFA)
Driver Tag/Commit
efa_2.6.0-1.amzn1_amd64
Custom Code
No
OS Platform and Distribution
5.15.0-2000-aws #2000 SMP Thu May 23 13:16:55 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux
Stock Ubuntu kernel 5.15.0-1006 with
nvme
moved from builtins to external modules.
Bug description
When built with
rdma
pointing at the Mellanox distribution (i.e., /usr/src/ofa_kernel/x86_64/5.15.0-2000-aws/include/rdma
belonging to https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/), insmod efa
fails with
The reason we are jumping through these weird hoops is that we want GPUDirect in nvme working together with EFA. Right now, we have to choose one of the two because of the described crash.
Reproduction steps
Expected Behavior
efa
inserts along with existing ib_core
,ib_uverbs
, etc. from Mellanox.
Actual Behavior
insmod
hangs.
Additional Data
1. Region = us-east-2
2. Instance type = p5.48xlarge
Relevant log output
No response
Contact Details
vadim@poolside.ai
The text was updated successfully, but these errors were encountered: