Skip to content

Commit

Permalink
add OpenBLAS detection and modify test code (#40)
Browse files Browse the repository at this point in the history
* fix indents and commands for Haiku, and add OpenBLAS detection in src/CMakeLists.txt

* add system detection and add OpenBLAS detection

* allow the test loop count to be set via the GGML_NLOOP environment variable or a command-line option

* change fmadd code for systems without FMA support

* allow n_threads to be set via the GGML_NTHREADS environment variable or a command-line option

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
  • Loading branch information
katsu560 and ggerganov authored Mar 22, 2023
1 parent 2aed624 commit 434b8ea
Show file tree
Hide file tree
Showing 6 changed files with 226 additions and 19 deletions.
50 changes: 37 additions & 13 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ endif()
if (NOT UNAME_M)
execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M)
endif()
message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")
#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
Expand Down Expand Up @@ -68,7 +68,7 @@ else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
elseif (UNAME_S MATCHES "Linux")
elseif (UNAME_S MATCHES "Linux")
message(STATUS "Linux detected")
execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
if (AVX1_M MATCHES "avx")
Expand All @@ -90,26 +90,24 @@ elseif (UNAME_S MATCHES "Linux")
if (SSE3_M MATCHES "sse3")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
endif()
message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
elseif (UNAME_S MATCHES "Haiku")
elseif (UNAME_S MATCHES "Haiku")
message(STATUS "Haiku detected")
execute_process(COMMAND bash -c "sysinfo -cpu | grep -w AVX" OUTPUT_VARIABLE AVX1_M)
if (AVX1_M MATCHES "AVX")
execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
if (AVX1_M MATCHES "avx")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
execute_process(COMMAND bash -c "sysinfo -cpu | grep -w AVX2" OUTPUT_VARIABLE AVX2_M)
if (AVX2_M MATCHES "AVX2")
execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
if (AVX2_M MATCHES "avx2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
execute_process(COMMAND bash -c "sysinfo -cpu | grep -w FMA" OUTPUT_VARIABLE FMA_M)
if (FMA_M MATCHES "FMA")
execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
if (FMA_M MATCHES "fma")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
execute_process(COMMAND bash -c "sysinfo -cpu | grep -w F16C" OUTPUT_VARIABLE F16C_M)
if (F16C_M MATCHES "F16C")
execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
if (F16C_M MATCHES "f16c")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2")
endif()
Expand All @@ -133,6 +131,31 @@ if (APPLE AND NOT GGML_NO_ACCELERATE)
endif()
endif()

# Optional OpenBLAS support; this section runs only when GGML_OPENBLAS is truthy.
# It looks for cblas.h and the OpenBLAS library. When BOTH are found it appends:
#   GGML_EXTRA_LIBS  - the OpenBLAS library path
#   GGML_EXTRA_INCS  - the directory containing cblas.h
#   GGML_EXTRA_FLAGS - -DGGML_USE_OPENBLAS
# Otherwise it emits a warning and leaves the three lists untouched.
if (GGML_OPENBLAS)
    set(OPENBLAS_INCLUDE_SEARCH_PATHS
        /usr/include
        /usr/include/openblas
        /usr/include/openblas-base
        /usr/local/include
        /usr/local/include/openblas
        /usr/local/include/openblas-base
        /opt/OpenBLAS/include
    )

    # Add the user-supplied prefix only when it is set; with an empty
    # OpenBLAS_HOME, "$ENV{OpenBLAS_HOME}/include" would collapse to "/include".
    if (NOT "$ENV{OpenBLAS_HOME}" STREQUAL "")
        list(APPEND OPENBLAS_INCLUDE_SEARCH_PATHS
            "$ENV{OpenBLAS_HOME}"
            "$ENV{OpenBLAS_HOME}/include"
        )
    endif()

    find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
    find_library(OPENBLAS_LIB NAMES openblas libopenblas)

    # Require the header as well as the library: appending an
    # OPENBLAS_INC-NOTFOUND value to GGML_EXTRA_INCS would poison the
    # include directories of every target that consumes that list.
    if (OPENBLAS_LIB AND OPENBLAS_INC)
        message(STATUS "OpenBLAS found")

        list(APPEND GGML_EXTRA_LIBS  ${OPENBLAS_LIB})
        list(APPEND GGML_EXTRA_INCS  ${OPENBLAS_INC})
        list(APPEND GGML_EXTRA_FLAGS -DGGML_USE_OPENBLAS)
    else()
        message(WARNING "OpenBLAS not found")
    endif()
endif()

if (GGML_PERF)
set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF)
endif()
Expand All @@ -145,6 +168,7 @@ target_include_directories(${TARGET} PUBLIC
.
../include
../include/ggml
${GGML_EXTRA_INCS}
)

if (MSVC)
Expand Down
129 changes: 126 additions & 3 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,93 @@
# check systems
# Fill UNAME_S / UNAME_P / UNAME_M from `uname -s|-p|-m`, but only for the
# variables that are not already set (or are set to a false value).
foreach(uname_var UNAME_S UNAME_P UNAME_M)
    if (NOT ${uname_var})
        # Derive the uname switch from the variable suffix: UNAME_S -> -s, etc.
        string(REGEX REPLACE "^UNAME_" "" uname_flag "${uname_var}")
        string(TOLOWER "${uname_flag}" uname_flag)
        execute_process(COMMAND uname -${uname_flag} OUTPUT_VARIABLE ${uname_var})
    endif()
endforeach()
unset(uname_flag)
#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
#
# On Darwin, when UNAME_P does not contain "arm", ask sysctl whether the
# hardware supports arm64. If it does, only a warning is issued; UNAME_P and
# UNAME_M are not modified here.
if (UNAME_S MATCHES "Darwin")
    if (NOT UNAME_P MATCHES "arm")
        execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M)
        if (SYSCTL_M MATCHES "1")
            # The message is kept on a single line so no newline lands in the
            # middle of a word, and the URL matches the reference above.
            message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789")
        endif()
    endif()
endif()

# Build GGML_C_FLAGS, the SIMD compile flags used by the tests.
#
# - ARM / aarch64: no flags are added.
# - Otherwise (treated as x86) the host CPU is probed per operating system,
#   keyed on UNAME_S:
#     Darwin : sysctl machdep.cpu.features / machdep.cpu.leaf7_features
#     Linux  : grep on /proc/cpuinfo
#     Haiku  : sysinfo -cpu piped through grep
#     other  : no probing; -mfma -mf16c -mavx -mavx2 are assumed
#
# CMAKE_SYSTEM_PROCESSOR is referenced by name (not as ${...}) so that an
# empty value does not produce a malformed if() expression.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm" OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message(STATUS "ARM detected")
else()
    message(STATUS "x86 detected")
    if (UNAME_S MATCHES "Darwin")
        execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M)
        if (AVX1_M MATCHES "AVX1.0")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
        endif()
        execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M)
        if (AVX2_M MATCHES "AVX2")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
        endif()
        # FMA is looked up in the same machdep.cpu.features output as AVX1.0.
        if (AVX1_M MATCHES "FMA")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
        endif()
        # F16C is not probed on Darwin; the flag is always added.
        set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
    elseif (UNAME_S MATCHES "Linux")
        message(STATUS "Linux detected")
        # The trailing space in each grep pattern keeps "avx " from matching "avx2".
        execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
        if (AVX1_M MATCHES "avx")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
        endif()
        execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M)
        if (AVX2_M MATCHES "avx2")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
        endif()
        execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M)
        if (FMA_M MATCHES "fma")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
        endif()
        execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M)
        if (F16C_M MATCHES "f16c")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
        endif()
        execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M)
        if (SSE3_M MATCHES "sse3")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -msse3")
        endif()
    elseif (UNAME_S MATCHES "Haiku")
        message(STATUS "Haiku detected")
        # grep filters for UPPERCASE feature names, and CMake regexes are
        # case-sensitive, so the MATCHES patterns must be uppercase as well.
        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
        if (AVX1_M MATCHES "AVX")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
        endif()
        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
        if (AVX2_M MATCHES "AVX2")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
        endif()
        execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
        if (FMA_M MATCHES "FMA")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
        endif()
        execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
        if (F16C_M MATCHES "F16C")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
        endif()
    else()
        # Unknown OS: no probing is done, the full flag set is assumed.
        set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma -mf16c -mavx -mavx2")
    endif()
endif()

# on APPLE - include Accelerate framework
if (APPLE AND NOT GGML_NO_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
Expand All @@ -11,6 +101,31 @@ if (APPLE AND NOT GGML_NO_ACCELERATE)
endif()
endif()

# Optional OpenBLAS support for the tests; runs only when GGML_OPENBLAS is truthy.
# It looks for cblas.h and the OpenBLAS library. When BOTH are found it appends:
#   GGML_EXTRA_LIBS  - the OpenBLAS library path
#   GGML_EXTRA_INCS  - the directory containing cblas.h
#   GGML_EXTRA_FLAGS - -DGGML_USE_OPENBLAS
# Otherwise it emits a warning and leaves the three lists untouched.
if (GGML_OPENBLAS)
    set(OPENBLAS_INCLUDE_SEARCH_PATHS
        /usr/include
        /usr/include/openblas
        /usr/include/openblas-base
        /usr/local/include
        /usr/local/include/openblas
        /usr/local/include/openblas-base
        /opt/OpenBLAS/include
    )

    # Add the user-supplied prefix only when it is set; with an empty
    # OpenBLAS_HOME, "$ENV{OpenBLAS_HOME}/include" would collapse to "/include".
    if (NOT "$ENV{OpenBLAS_HOME}" STREQUAL "")
        list(APPEND OPENBLAS_INCLUDE_SEARCH_PATHS
            "$ENV{OpenBLAS_HOME}"
            "$ENV{OpenBLAS_HOME}/include"
        )
    endif()

    find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
    find_library(OPENBLAS_LIB NAMES openblas libopenblas)

    # Require the header as well as the library so that a NOTFOUND include
    # path is never appended to GGML_EXTRA_INCS.
    if (OPENBLAS_LIB AND OPENBLAS_INC)
        message(STATUS "OpenBLAS found")

        list(APPEND GGML_EXTRA_LIBS  ${OPENBLAS_LIB})
        list(APPEND GGML_EXTRA_INCS  ${OPENBLAS_INC})
        list(APPEND GGML_EXTRA_FLAGS -DGGML_USE_OPENBLAS)
    else()
        message(WARNING "OpenBLAS not found")
    endif()
endif()

#
# test-vec0

Expand All @@ -26,7 +141,8 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86")
add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mf16c")
#set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mf16c")
# Use the probed SIMD flags. The value is quoted so that an empty
# GGML_C_FLAGS still passes a (blank) property value instead of dropping the
# argument and making the call malformed.
set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "${GGML_C_FLAGS}")
endif()

#
Expand All @@ -51,7 +167,8 @@ add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)

set(TEST_TARGET test-mul-mat0)
add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)

#
Expand Down Expand Up @@ -106,13 +223,19 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)

#
# test-svd0 (arm)
# test-svd0 (arm/x86)

# test-svd0 is registered in exactly two situations:
#   - the processor matches "arm" and GGML_NO_ACCELERATE is not set, or
#   - the processor matches "x86" and GGML_OPENBLAS is set.
# Both cases build and register the test identically, so they share one body.
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE) OR
    (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86" AND GGML_OPENBLAS))
    set(TEST_TARGET test-svd0)
    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
    # Link the extra BLAS libraries and pass the matching -D flags.
    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
    target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
endif()

12 changes: 11 additions & 1 deletion tests/test-grad0.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,17 @@ int main(int argc, const char ** argv) {

int ne[4];

for (int iter = 0; iter < 1000; ++iter) {
// original loop: 1000
int niter = 1000;
const char *env = getenv("GGML_NLOOP");
if (env != NULL) {
niter = atoi(env);
}
if (argc > 1) {
niter = atoi(argv[1]);
}
for (int iter = 0; iter < niter; ++iter) {
printf("test-grad0: iter:%d/%d\n", iter, niter);
struct ggml_context * ctx0 = ggml_init(params);

get_random_dims(ne, 4);
Expand Down
12 changes: 11 additions & 1 deletion tests/test-mul-mat0.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,17 @@ int main(int argc, const char ** argv) {

int ne[4];

for (int iter = 0; iter < 500; ++iter) {
// original loop: 500
int niter = 500;
const char *env = getenv("GGML_NLOOP");
if (env != NULL) {
niter = atoi(env);
}
if (argc > 1) {
niter = atoi(argv[1]);
}
for (int iter = 0; iter < niter; ++iter) {
printf("test-mul-mat0: iter:%d/%d\n", iter, niter);
struct ggml_context * ctx0 = ggml_init(params);

get_random_dims(ne, 4);
Expand Down
30 changes: 30 additions & 0 deletions tests/test-vec1.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,17 @@ void mul_mat_vec_f32_2(
__m256 b1 = _mm256_loadu_ps(src1 + j + 8);
__m256 b2 = _mm256_loadu_ps(src1 + j + 16);
__m256 b3 = _mm256_loadu_ps(src1 + j + 24);
#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
sum2 = _mm256_fmadd_ps(a2, b2, sum2);
sum3 = _mm256_fmadd_ps(a3, b3, sum3);
#else
sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
#endif
}
dst[i] = reduce_vector8_0(_mm256_add_ps(_mm256_add_ps(sum0, sum1), _mm256_add_ps(sum2, sum3)));

Expand Down Expand Up @@ -314,7 +321,11 @@ void mul_mat_vec_f16_0(
for (int j = 0; j < ncols8; j += 8) {
__m256 a = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j)));
__m256 b = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
#if defined(__FMA__)
sum = _mm256_fmadd_ps(a, b, sum);
#else
sum = _mm256_add_ps(_mm256_mul_ps(a, b), sum);
#endif
}
dst[i] = reduce_vector8_0(sum);

Expand Down Expand Up @@ -343,8 +354,13 @@ void mul_mat_vec_f16_1(
__m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8)));
__m256 b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
__m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8)));
#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
#else
sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
#endif
}
dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1);

Expand Down Expand Up @@ -379,10 +395,17 @@ void mul_mat_vec_f16_2(
__m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8)));
__m256 b2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 16)));
__m256 b3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 24)));
#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
sum2 = _mm256_fmadd_ps(a2, b2, sum2);
sum3 = _mm256_fmadd_ps(a3, b3, sum3);
#else
sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
#endif
}
dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);

Expand Down Expand Up @@ -417,10 +440,17 @@ void mul_mat_vec_f16_3(
__m256 b1 = _mm256_loadu_ps(src1 + j + 8);
__m256 b2 = _mm256_loadu_ps(src1 + j + 16);
__m256 b3 = _mm256_loadu_ps(src1 + j + 24);
#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
sum2 = _mm256_fmadd_ps(a2, b2, sum2);
sum3 = _mm256_fmadd_ps(a3, b3, sum3);
#else
sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
#endif
}
dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);

Expand Down
12 changes: 11 additions & 1 deletion tests/test2.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,17 @@ int main(int argc, const char ** argv) {
struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
opt_params.adam.alpha = 0.01f;

opt_params.n_threads = (argc > 1) ? atoi(argv[1]) : 8;
// original threads: 8
int nthreads = 8;
const char *env = getenv("GGML_NTHREADS");
if (env != NULL) {
nthreads = atoi(env);
}
if (argc > 1) {
nthreads = atoi(argv[1]);
}
opt_params.n_threads = nthreads;
printf("test2: n_threads:%d\n", opt_params.n_threads);

const float xi[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f , 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, };
float yi[] = { 15.0f, 25.0f, 35.0f, 45.0f, 55.0f, 65.0f, 75.0f, 85.0f, 95.0f, 105.0f, };
Expand Down

0 comments on commit 434b8ea

Please sign in to comment.