Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add OpenBLAS detection and modify tests codes #40

Merged
merged 6 commits into from
Mar 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 37 additions & 13 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ endif()
if (NOT UNAME_M)
execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M)
endif()
message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")
#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
Expand Down Expand Up @@ -68,7 +68,7 @@ else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
elseif (UNAME_S MATCHES "Linux")
elseif (UNAME_S MATCHES "Linux")
message(STATUS "Linux detected")
execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
if (AVX1_M MATCHES "avx")
Expand All @@ -90,26 +90,24 @@ elseif (UNAME_S MATCHES "Linux")
if (SSE3_M MATCHES "sse3")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
endif()
message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
elseif (UNAME_S MATCHES "Haiku")
elseif (UNAME_S MATCHES "Haiku")
message(STATUS "Haiku detected")
execute_process(COMMAND bash -c "sysinfo -cpu | grep -w AVX" OUTPUT_VARIABLE AVX1_M)
if (AVX1_M MATCHES "AVX")
execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
if (AVX1_M MATCHES "avx")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
execute_process(COMMAND bash -c "sysinfo -cpu | grep -w AVX2" OUTPUT_VARIABLE AVX2_M)
if (AVX2_M MATCHES "AVX2")
execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
if (AVX2_M MATCHES "avx2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
execute_process(COMMAND bash -c "sysinfo -cpu | grep -w FMA" OUTPUT_VARIABLE FMA_M)
if (FMA_M MATCHES "FMA")
execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
if (FMA_M MATCHES "fma")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
execute_process(COMMAND bash -c "sysinfo -cpu | grep -w F16C" OUTPUT_VARIABLE F16C_M)
if (F16C_M MATCHES "F16C")
execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
if (F16C_M MATCHES "f16c")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2")
endif()
Expand All @@ -133,6 +131,31 @@ if (APPLE AND NOT GGML_NO_ACCELERATE)
endif()
endif()

# Optional OpenBLAS backend, enabled with GGML_OPENBLAS.
# Looks for cblas.h in a list of common include directories (plus
# OpenBLAS_HOME from the environment, when set) and for the openblas library
# in CMake's default library locations. On success the library, the include
# directory and the GGML_USE_OPENBLAS define are appended to GGML_EXTRA_LIBS,
# GGML_EXTRA_INCS and GGML_EXTRA_FLAGS; otherwise a warning is printed.
if (GGML_OPENBLAS)
    set(OPENBLAS_INCLUDE_SEARCH_PATHS
        /usr/include
        /usr/include/openblas
        /usr/include/openblas-base
        /usr/local/include
        /usr/local/include/openblas
        /usr/local/include/openblas-base
        /opt/OpenBLAS/include
    )
    # Only consult OpenBLAS_HOME when it is set: with an empty value
    # "$ENV{OpenBLAS_HOME}/include" would expand to the unrelated path "/include".
    if (NOT "$ENV{OpenBLAS_HOME}" STREQUAL "")
        list(APPEND OPENBLAS_INCLUDE_SEARCH_PATHS
            $ENV{OpenBLAS_HOME}
            $ENV{OpenBLAS_HOME}/include
        )
    endif()
    find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
    find_library(OPENBLAS_LIB NAMES openblas libopenblas)
    # Require BOTH the library and the header. If only the library were
    # checked, a missing cblas.h would push "OPENBLAS_INC-NOTFOUND" into
    # GGML_EXTRA_INCS while GGML_USE_OPENBLAS is still defined.
    if (OPENBLAS_LIB AND OPENBLAS_INC)
        message(STATUS "OpenBLAS found")

        list(APPEND GGML_EXTRA_LIBS  ${OPENBLAS_LIB})
        list(APPEND GGML_EXTRA_INCS  ${OPENBLAS_INC})
        list(APPEND GGML_EXTRA_FLAGS -DGGML_USE_OPENBLAS)
    else()
        message(WARNING "OpenBLAS not found")
    endif()
endif()

# When GGML_PERF is enabled, add the GGML_PERF preprocessor define to the
# extra compile flags.
if (GGML_PERF)
    list(APPEND GGML_EXTRA_FLAGS -DGGML_PERF)
endif()
Expand All @@ -145,6 +168,7 @@ target_include_directories(${TARGET} PUBLIC
.
../include
../include/ggml
${GGML_EXTRA_INCS}
)

if (MSVC)
Expand Down
129 changes: 126 additions & 3 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,93 @@
# Probe the host system with `uname`:
#   UNAME_S <- uname -s, UNAME_P <- uname -p, UNAME_M <- uname -m
# A variable that already holds a non-false value is left alone and its
# probe is skipped.
foreach (uname_opt s p m)
    string(TOUPPER "${uname_opt}" uname_key)
    if (NOT UNAME_${uname_key})
        execute_process(COMMAND uname -${uname_opt} OUTPUT_VARIABLE UNAME_${uname_key})
    endif()
endforeach()
#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
#
# On Darwin, when uname does not report an "arm" processor, query
# `sysctl -n hw.optional.arm64`. If its output matches "1", warn the user.
# UNAME_P and UNAME_M are not overridden here; only the warning is emitted.
if (UNAME_S MATCHES "Darwin")
    if (NOT UNAME_P MATCHES "arm")
        execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M)
        if (SYSCTL_M MATCHES "1")
            # Keep the message on a single line: a line break inside the quoted
            # argument would be printed verbatim in the middle of the text.
            message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789")
        endif()
    endif()
endif()

# Build GGML_C_FLAGS, the SIMD compiler flags for the x86-only tests.
#
# - ARM / aarch64: nothing is added.
# - x86 on Darwin: AVX and FMA are read from `sysctl machdep.cpu.features`,
#   AVX2 from `sysctl machdep.cpu.leaf7_features`; -mf16c is always added.
# - x86 on Linux: each feature is looked up in /proc/cpuinfo.
# - x86 on Haiku: each feature is looked up in the output of `sysinfo -cpu`.
# - any other x86 system: all four flags (-mfma -mf16c -mavx -mavx2) are assumed.
#
# CMAKE_SYSTEM_PROCESSOR is referenced by name (not as ${...}) so that the
# condition still parses when the variable is empty.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm" OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message(STATUS "ARM detected")
else()
    message(STATUS "x86 detected")
    if (UNAME_S MATCHES "Darwin")
        execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M)
        if (AVX1_M MATCHES "AVX1.0")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
        endif()
        execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M)
        if (AVX2_M MATCHES "AVX2")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
        endif()
        # FMA is reported in the same feature list that was read for AVX.
        if (AVX1_M MATCHES "FMA")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
        endif()
        set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
    elseif (UNAME_S MATCHES "Linux")
        message(STATUS "Linux detected")
        execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
        if (AVX1_M MATCHES "avx")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
        endif()
        execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M)
        if (AVX2_M MATCHES "avx2")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
        endif()
        execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M)
        if (FMA_M MATCHES "fma")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
        endif()
        execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M)
        if (F16C_M MATCHES "f16c")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
        endif()
        execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M)
        if (SSE3_M MATCHES "sse3")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -msse3")
        endif()
    elseif (UNAME_S MATCHES "Haiku")
        message(STATUS "Haiku detected")
        # The grep patterns below are upper-case, so the MATCHES patterns must
        # be upper-case as well: CMake regular expressions are case-sensitive
        # and a lower-case pattern would never match the captured output.
        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
        if (AVX1_M MATCHES "AVX")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
        endif()
        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
        if (AVX2_M MATCHES "AVX2")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
        endif()
        execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
        if (FMA_M MATCHES "FMA")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
        endif()
        execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
        if (F16C_M MATCHES "F16C")
            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
        endif()
    else()
        # Unknown x86 system: no probe available, assume all four extensions.
        set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma -mf16c -mavx -mavx2")
    endif()
endif()

# on APPLE - include Accelerate framework
if (APPLE AND NOT GGML_NO_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
Expand All @@ -11,6 +101,31 @@ if (APPLE AND NOT GGML_NO_ACCELERATE)
endif()
endif()

# Optional OpenBLAS backend, enabled with GGML_OPENBLAS.
# Looks for cblas.h in a list of common include directories (plus
# OpenBLAS_HOME from the environment, when set) and for the openblas library
# in CMake's default library locations. On success the library, the include
# directory and the GGML_USE_OPENBLAS define are appended to GGML_EXTRA_LIBS,
# GGML_EXTRA_INCS and GGML_EXTRA_FLAGS; otherwise a warning is printed.
if (GGML_OPENBLAS)
    set(OPENBLAS_INCLUDE_SEARCH_PATHS
        /usr/include
        /usr/include/openblas
        /usr/include/openblas-base
        /usr/local/include
        /usr/local/include/openblas
        /usr/local/include/openblas-base
        /opt/OpenBLAS/include
    )
    # Only consult OpenBLAS_HOME when it is set: with an empty value
    # "$ENV{OpenBLAS_HOME}/include" would expand to the unrelated path "/include".
    if (NOT "$ENV{OpenBLAS_HOME}" STREQUAL "")
        list(APPEND OPENBLAS_INCLUDE_SEARCH_PATHS
            $ENV{OpenBLAS_HOME}
            $ENV{OpenBLAS_HOME}/include
        )
    endif()
    find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
    find_library(OPENBLAS_LIB NAMES openblas libopenblas)
    # Require BOTH the library and the header. If only the library were
    # checked, a missing cblas.h would push "OPENBLAS_INC-NOTFOUND" into
    # GGML_EXTRA_INCS while GGML_USE_OPENBLAS is still defined.
    if (OPENBLAS_LIB AND OPENBLAS_INC)
        message(STATUS "OpenBLAS found")

        list(APPEND GGML_EXTRA_LIBS  ${OPENBLAS_LIB})
        list(APPEND GGML_EXTRA_INCS  ${OPENBLAS_INC})
        list(APPEND GGML_EXTRA_FLAGS -DGGML_USE_OPENBLAS)
    else()
        message(WARNING "OpenBLAS not found")
    endif()
endif()

#
# test-vec0

Expand All @@ -26,7 +141,8 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86")
add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mf16c")
#set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mf16c")
# Quote the value: when no SIMD flag was detected GGML_C_FLAGS is empty, and
# an unquoted empty expansion would leave COMPILE_FLAGS without a value,
# which makes set_target_properties() fail at configure time.
set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "${GGML_C_FLAGS}")
endif()

#
Expand All @@ -51,7 +167,8 @@ add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)

set(TEST_TARGET test-mul-mat0)
add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)

#
Expand Down Expand Up @@ -106,13 +223,19 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)

#
# test-svd0 (arm)
# test-svd0 (arm/x86)

# test-svd0 is built and registered in two situations, with identical
# settings in both:
#   - ARM processor and GGML_NO_ACCELERATE not set
#   - x86 processor and GGML_OPENBLAS set
# The target links ggml plus GGML_EXTRA_LIBS and is compiled with
# GGML_EXTRA_FLAGS.
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE) OR
    (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86" AND GGML_OPENBLAS))
    set(TEST_TARGET test-svd0)
    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
    target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
endif()

12 changes: 11 additions & 1 deletion tests/test-grad0.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,17 @@ int main(int argc, const char ** argv) {

int ne[4];

for (int iter = 0; iter < 1000; ++iter) {
// original loop: 1000
int niter = 1000;
const char *env = getenv("GGML_NLOOP");
if (env != NULL) {
niter = atoi(env);
}
if (argc > 1) {
niter = atoi(argv[1]);
}
for (int iter = 0; iter < niter; ++iter) {
printf("test-grad0: iter:%d/%d\n", iter, niter);
struct ggml_context * ctx0 = ggml_init(params);

get_random_dims(ne, 4);
Expand Down
12 changes: 11 additions & 1 deletion tests/test-mul-mat0.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,17 @@ int main(int argc, const char ** argv) {

int ne[4];

for (int iter = 0; iter < 500; ++iter) {
// original loop: 500
int niter = 500;
const char *env = getenv("GGML_NLOOP");
if (env != NULL) {
niter = atoi(env);
}
if (argc > 1) {
niter = atoi(argv[1]);
}
for (int iter = 0; iter < niter; ++iter) {
printf("test-mul-mat0: iter:%d/%d\n", iter, niter);
struct ggml_context * ctx0 = ggml_init(params);

get_random_dims(ne, 4);
Expand Down
30 changes: 30 additions & 0 deletions tests/test-vec1.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,17 @@ void mul_mat_vec_f32_2(
__m256 b1 = _mm256_loadu_ps(src1 + j + 8);
__m256 b2 = _mm256_loadu_ps(src1 + j + 16);
__m256 b3 = _mm256_loadu_ps(src1 + j + 24);
#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
sum2 = _mm256_fmadd_ps(a2, b2, sum2);
sum3 = _mm256_fmadd_ps(a3, b3, sum3);
#else
sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
#endif
}
dst[i] = reduce_vector8_0(_mm256_add_ps(_mm256_add_ps(sum0, sum1), _mm256_add_ps(sum2, sum3)));

Expand Down Expand Up @@ -314,7 +321,11 @@ void mul_mat_vec_f16_0(
for (int j = 0; j < ncols8; j += 8) {
__m256 a = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j)));
__m256 b = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
#if defined(__FMA__)
sum = _mm256_fmadd_ps(a, b, sum);
#else
sum = _mm256_add_ps(_mm256_mul_ps(a, b), sum);
#endif
}
dst[i] = reduce_vector8_0(sum);

Expand Down Expand Up @@ -343,8 +354,13 @@ void mul_mat_vec_f16_1(
__m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8)));
__m256 b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
__m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8)));
#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
#else
sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
#endif
}
dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1);

Expand Down Expand Up @@ -379,10 +395,17 @@ void mul_mat_vec_f16_2(
__m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8)));
__m256 b2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 16)));
__m256 b3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 24)));
#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
sum2 = _mm256_fmadd_ps(a2, b2, sum2);
sum3 = _mm256_fmadd_ps(a3, b3, sum3);
#else
sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
#endif
}
dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);

Expand Down Expand Up @@ -417,10 +440,17 @@ void mul_mat_vec_f16_3(
__m256 b1 = _mm256_loadu_ps(src1 + j + 8);
__m256 b2 = _mm256_loadu_ps(src1 + j + 16);
__m256 b3 = _mm256_loadu_ps(src1 + j + 24);
#if defined(__FMA__)
sum0 = _mm256_fmadd_ps(a0, b0, sum0);
sum1 = _mm256_fmadd_ps(a1, b1, sum1);
sum2 = _mm256_fmadd_ps(a2, b2, sum2);
sum3 = _mm256_fmadd_ps(a3, b3, sum3);
#else
sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
#endif
}
dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);

Expand Down
12 changes: 11 additions & 1 deletion tests/test2.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,17 @@ int main(int argc, const char ** argv) {
struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
opt_params.adam.alpha = 0.01f;

opt_params.n_threads = (argc > 1) ? atoi(argv[1]) : 8;
// original threads: 8
int nthreads = 8;
const char *env = getenv("GGML_NTHREADS");
if (env != NULL) {
nthreads = atoi(env);
}
if (argc > 1) {
nthreads = atoi(argv[1]);
}
opt_params.n_threads = nthreads;
printf("test2: n_threads:%d\n", opt_params.n_threads);

const float xi[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f , 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, };
float yi[] = { 15.0f, 25.0f, 35.0f, 45.0f, 55.0f, 65.0f, 75.0f, 85.0f, 95.0f, 105.0f, };
Expand Down