diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 73658a5fa..a97050044 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -34,7 +34,7 @@ endif()
 if (NOT UNAME_M)
     execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M)
 endif()
-message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")
+#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")
 
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
@@ -68,7 +68,7 @@ else()
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
         endif()
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-elseif (UNAME_S MATCHES "Linux")
+    elseif (UNAME_S MATCHES "Linux")
         message(STATUS "Linux detected")
         execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
         if (AVX1_M MATCHES "avx")
@@ -90,26 +90,24 @@ elseif (UNAME_S MATCHES "Linux")
         if (SSE3_M MATCHES "sse3")
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
         endif()
-        message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
-elseif (UNAME_S MATCHES "Haiku")
+    elseif (UNAME_S MATCHES "Haiku")
         message(STATUS "Haiku detected")
-        execute_process(COMMAND bash -c "sysinfo -cpu | grep -w AVX" OUTPUT_VARIABLE AVX1_M)
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
         if (AVX1_M MATCHES "AVX")
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
         endif()
-        execute_process(COMMAND bash -c "sysinfo -cpu | grep -w AVX2" OUTPUT_VARIABLE AVX2_M)
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
         if (AVX2_M MATCHES "AVX2")
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
         endif()
-        execute_process(COMMAND bash -c "sysinfo -cpu | grep -w FMA" OUTPUT_VARIABLE FMA_M)
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
         if (FMA_M MATCHES "FMA")
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
         endif()
-        execute_process(COMMAND bash -c "sysinfo -cpu | grep -w F16C" OUTPUT_VARIABLE F16C_M)
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
         if (F16C_M MATCHES "F16C")
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
         endif()
-        message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
     else()
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2")
     endif()
@@ -133,6 +131,31 @@ if (APPLE AND NOT GGML_NO_ACCELERATE)
     endif()
 endif()
 
+if (GGML_OPENBLAS)
+    set(OPENBLAS_INCLUDE_SEARCH_PATHS
+        /usr/include
+        /usr/include/openblas
+        /usr/include/openblas-base
+        /usr/local/include
+        /usr/local/include/openblas
+        /usr/local/include/openblas-base
+        /opt/OpenBLAS/include
+        $ENV{OpenBLAS_HOME}
+        $ENV{OpenBLAS_HOME}/include
+        )
+    find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+    find_library(OPENBLAS_LIB NAMES openblas libopenblas)
+    if (OPENBLAS_LIB)
+        message(STATUS "OpenBLAS found")
+
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${OPENBLAS_LIB})
+        set(GGML_EXTRA_INCS ${GGML_EXTRA_INCS} ${OPENBLAS_INC})
+        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
+    else()
+        message(WARNING "OpenBLAS not found")
+    endif()
+endif()
+
 if (GGML_PERF)
     set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF)
 endif()
@@ -145,6 +168,7 @@ target_include_directories(${TARGET} PUBLIC
     .
     ../include
     ../include/ggml
+    ${GGML_EXTRA_INCS}
     )
 
 if (MSVC)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 856cc3ed8..ac7039fea 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,3 +1,93 @@
+# check systems
+if (NOT UNAME_S)
+    execute_process(COMMAND uname -s OUTPUT_VARIABLE UNAME_S)
+endif()
+if (NOT UNAME_P)
+    execute_process(COMMAND uname -p OUTPUT_VARIABLE UNAME_P)
+endif()
+if (NOT UNAME_M)
+    execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M)
+endif()
+#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}")
+
+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+if (UNAME_S MATCHES "Darwin")
+    if (NOT UNAME_P MATCHES "arm")
+        execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M)
+        if (SYSCTL_M MATCHES "1")
+            #set(UNAME_P "arm")
+            #set(UNAME_M "arm64")
+            message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead \
+to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789")
+        endif()
+    endif()
+endif()
+
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+    message(STATUS "ARM detected")
+    #set(GGML_C_FLAGS "${GGML_C_FLAGS} -mcpu=apple-m1")
+else()
+    message(STATUS "x86 detected")
+    #set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
+    if (UNAME_S MATCHES "Darwin")
+        execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M)
+        if (AVX1_M MATCHES "AVX1.0")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
+        endif()
+        execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M)
+        if (AVX2_M MATCHES "AVX2")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
+        endif()
+        if (AVX1_M MATCHES "FMA")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
+        endif()
+        set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
+    elseif (UNAME_S MATCHES "Linux")
+        message(STATUS "Linux detected")
+        execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
+        if (AVX1_M MATCHES "avx")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
+        endif()
+        execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M)
+        if (AVX2_M MATCHES "avx2")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
+        endif()
+        execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M)
+        if (FMA_M MATCHES "fma")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
+        endif()
+        execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M)
+        if (F16C_M MATCHES "f16c")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
+        endif()
+        execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M)
+        if (SSE3_M MATCHES "sse3")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -msse3")
+        endif()
+    elseif (UNAME_S MATCHES "Haiku")
+        message(STATUS "Haiku detected")
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
+        if (AVX1_M MATCHES "AVX")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx")
+        endif()
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
+        if (AVX2_M MATCHES "AVX2")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mavx2")
+        endif()
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
+        if (FMA_M MATCHES "FMA")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma")
+        endif()
+        execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
+        if (F16C_M MATCHES "F16C")
+            set(GGML_C_FLAGS "${GGML_C_FLAGS} -mf16c")
+        endif()
+    else()
+        set(GGML_C_FLAGS "${GGML_C_FLAGS} -mfma -mf16c -mavx -mavx2")
+    endif()
+endif()
+
 # on APPLE - include Accelerate framework
 if (APPLE AND NOT GGML_NO_ACCELERATE)
     find_library(ACCELERATE_FRAMEWORK Accelerate)
@@ -11,6 +101,31 @@ if (APPLE AND NOT GGML_NO_ACCELERATE)
     endif()
 endif()
 
+if (GGML_OPENBLAS)
+    set(OPENBLAS_INCLUDE_SEARCH_PATHS
+        /usr/include
+        /usr/include/openblas
+        /usr/include/openblas-base
+        /usr/local/include
+        /usr/local/include/openblas
+        /usr/local/include/openblas-base
+        /opt/OpenBLAS/include
+        $ENV{OpenBLAS_HOME}
+        $ENV{OpenBLAS_HOME}/include
+        )
+    find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+    find_library(OPENBLAS_LIB NAMES openblas libopenblas)
+    if (OPENBLAS_LIB)
+        message(STATUS "OpenBLAS found")
+
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${OPENBLAS_LIB})
+        set(GGML_EXTRA_INCS ${GGML_EXTRA_INCS} ${OPENBLAS_INC})
+        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
+    else()
+        message(WARNING "OpenBLAS not found")
+    endif()
+endif()
+
 #
 # test-vec0
 
@@ -26,7 +141,8 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86")
     add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
     target_link_libraries(${TEST_TARGET} PRIVATE ggml)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-    set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mf16c")
+    #set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mf16c")
+    set_target_properties(${TEST_TARGET} PROPERTIES COMPILE_FLAGS "${GGML_C_FLAGS}")
 endif()
 
 #
@@ -51,7 +167,8 @@ add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 
 set(TEST_TARGET test-mul-mat0)
 add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
+target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
 add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 
 #
@@ -106,7 +223,7 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 
 #
-# test-svd0 (arm)
+# test-svd0 (arm/x86)
 
 if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
     set(TEST_TARGET test-svd0)
@@ -114,5 +231,11 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
     target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
     target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86" AND GGML_OPENBLAS)
+    set(TEST_TARGET test-svd0)
+    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
+    target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
+    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 endif()
 
diff --git a/tests/test-grad0.c b/tests/test-grad0.c
index 4814b5929..2bbcf2554 100644
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -210,7 +210,17 @@ int main(int argc, const char ** argv) {
 
     int ne[4];
 
-    for (int iter = 0; iter < 1000; ++iter) {
+    // original loop: 1000
+    int niter = 1000;
+    const char *env = getenv("GGML_NLOOP");
+    if (env != NULL) {
+        niter = atoi(env);
+    }
+    if (argc > 1) {
+        niter = atoi(argv[1]);
+    }
+    for (int iter = 0; iter < niter; ++iter) {
+        printf("test-grad0: iter:%d/%d\n", iter, niter);
         struct ggml_context * ctx0 = ggml_init(params);
 
         get_random_dims(ne, 4);
diff --git a/tests/test-mul-mat0.c b/tests/test-mul-mat0.c
index 1215c4ca0..2a367c8e1 100644
--- a/tests/test-mul-mat0.c
+++ b/tests/test-mul-mat0.c
@@ -232,7 +232,17 @@ int main(int argc, const char ** argv) {
 
     int ne[4];
 
-    for (int iter = 0; iter < 500; ++iter) {
+    // original loop: 500
+    int niter = 500;
+    const char *env = getenv("GGML_NLOOP");
+    if (env != NULL) {
+        niter = atoi(env);
+    }
+    if (argc > 1) {
+        niter = atoi(argv[1]);
+    }
+    for (int iter = 0; iter < niter; ++iter) {
+        printf("test-mul-mat0: iter:%d/%d\n", iter, niter);
         struct ggml_context * ctx0 = ggml_init(params);
 
         get_random_dims(ne, 4);
diff --git a/tests/test-vec1.c b/tests/test-vec1.c
index 850c62252..34688debd 100644
--- a/tests/test-vec1.c
+++ b/tests/test-vec1.c
@@ -97,10 +97,17 @@ void mul_mat_vec_f32_2(
             __m256 b1 = _mm256_loadu_ps(src1 + j + 8);
             __m256 b2 = _mm256_loadu_ps(src1 + j + 16);
             __m256 b3 = _mm256_loadu_ps(src1 + j + 24);
+#if defined(__FMA__)
             sum0 = _mm256_fmadd_ps(a0, b0, sum0);
             sum1 = _mm256_fmadd_ps(a1, b1, sum1);
             sum2 = _mm256_fmadd_ps(a2, b2, sum2);
             sum3 = _mm256_fmadd_ps(a3, b3, sum3);
+#else
+            sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
+            sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
+            sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
+            sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
+#endif
         }
 
         dst[i] = reduce_vector8_0(_mm256_add_ps(_mm256_add_ps(sum0, sum1), _mm256_add_ps(sum2, sum3)));
@@ -314,7 +321,11 @@ void mul_mat_vec_f16_0(
         for (int j = 0; j < ncols8; j += 8) {
             __m256 a = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j)));
             __m256 b = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
+#if defined(__FMA__)
             sum = _mm256_fmadd_ps(a, b, sum);
+#else
+            sum = _mm256_add_ps(_mm256_mul_ps(a, b), sum);
+#endif
         }
 
         dst[i] = reduce_vector8_0(sum);
@@ -343,8 +354,13 @@ void mul_mat_vec_f16_1(
             __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8)));
             __m256 b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
             __m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8)));
+#if defined(__FMA__)
             sum0 = _mm256_fmadd_ps(a0, b0, sum0);
             sum1 = _mm256_fmadd_ps(a1, b1, sum1);
+#else
+            sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
+            sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
+#endif
         }
 
         dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1);
@@ -379,10 +395,17 @@ void mul_mat_vec_f16_2(
             __m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8)));
             __m256 b2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 16)));
             __m256 b3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 24)));
+#if defined(__FMA__)
             sum0 = _mm256_fmadd_ps(a0, b0, sum0);
             sum1 = _mm256_fmadd_ps(a1, b1, sum1);
             sum2 = _mm256_fmadd_ps(a2, b2, sum2);
             sum3 = _mm256_fmadd_ps(a3, b3, sum3);
+#else
+            sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
+            sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
+            sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
+            sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
+#endif
         }
 
         dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);
@@ -417,10 +440,17 @@ void mul_mat_vec_f16_3(
             __m256 b1 = _mm256_loadu_ps(src1 + j + 8);
             __m256 b2 = _mm256_loadu_ps(src1 + j + 16);
             __m256 b3 = _mm256_loadu_ps(src1 + j + 24);
+#if defined(__FMA__)
             sum0 = _mm256_fmadd_ps(a0, b0, sum0);
             sum1 = _mm256_fmadd_ps(a1, b1, sum1);
             sum2 = _mm256_fmadd_ps(a2, b2, sum2);
             sum3 = _mm256_fmadd_ps(a3, b3, sum3);
+#else
+            sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
+            sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
+            sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
+            sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
+#endif
         }
 
         dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);
diff --git a/tests/test2.c b/tests/test2.c
index 6ec52ab28..70f915820 100644
--- a/tests/test2.c
+++ b/tests/test2.c
@@ -20,7 +20,17 @@ int main(int argc, const char ** argv) {
     struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
     opt_params.adam.alpha = 0.01f;
 
-    opt_params.n_threads = (argc > 1) ? atoi(argv[1]) : 8;
+    // original threads: 8
+    int nthreads = 8;
+    const char *env = getenv("GGML_NTHREADS");
+    if (env != NULL) {
+        nthreads = atoi(env);
+    }
+    if (argc > 1) {
+        nthreads = atoi(argv[1]);
+    }
+    opt_params.n_threads = nthreads;
+    printf("test2: n_threads:%d\n", opt_params.n_threads);
 
     const float xi[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f , 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, };
     float yi[] = { 15.0f, 25.0f, 35.0f, 45.0f, 55.0f, 65.0f, 75.0f, 85.0f, 95.0f, 105.0f, };