diff --git a/Makefile b/Makefile index 1601079a48685..feda0e3f90755 100644 --- a/Makefile +++ b/Makefile @@ -30,8 +30,8 @@ endif # Compile flags # -CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC -CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC +CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC -D_GNU_SOURCE +CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -D_GNU_SOURCE LDFLAGS = # OS specific diff --git a/ggml.c b/ggml.c index 4fb83adbdb2c8..79dded6f4b002 100644 --- a/ggml.c +++ b/ggml.c @@ -76,6 +76,8 @@ static int sched_yield (void) { typedef void* thread_ret_t; #endif +#include + #ifdef __HAIKU__ #define static_assert(cond, msg) _Static_assert(cond, msg) #endif @@ -1425,7 +1427,58 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void __m256 acc = _mm256_setzero_ps(); // Main loop - for (int i = 0; i < nb; ++i) { + const int unroll_count = 4; + const int loop_count = nb / unroll_count; + for (int j = 0; j < loop_count; ++j) { + #pragma unroll + for (int idx = 0; idx < unroll_count; ++idx) { + // determin the actual index in the loop + const int i = j * unroll_count + idx; + const float * d0_0 = (const float *) (pd0 + i*bs); + const float * d1_0 = (const float *) (pd1 + i*bs); + + const uint8_t * restrict p0 = pb0 + i*bs; + const uint8_t * restrict p1 = pb1 + i*bs; + + // Prefetch data used later in the loop + // TODO these numbersi are device dependent shouldn't be hard coded derive + _mm_prefetch (d0_0 + 32*bs, 1); + _mm_prefetch (d1_0 + 32*bs, 1); + _mm_prefetch (p0 + 32*bs, 1); + _mm_prefetch (p1 + 32*bs, 1); + + // Compute combined scale for the block + const __m256 scale = _mm256_mul_ps( _mm256_broadcast_ss( d0_0 ), _mm256_broadcast_ss( d1_0 ) ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + __m256i bx = bytesFromNibbles( p0 ); + __m256i by = bytesFromNibbles( p1 ); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + by = _mm256_sub_epi8( by, off ); + + // Sign-extend first 16 signed bytes into int16_t + __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) ); + __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); + // Compute products of int16_t integers, add pairwise + __m256i i32 = _mm256_madd_epi16( x16, y16 ); + + // Sign-extend last 16 signed bytes into int16_t vectors + x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) ); + y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); + // Accumulate products of int16_t integers + i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) ); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps( i32 ); + // Apply the scale, and accumulate + acc = _mm256_fmadd_ps( scale, p, acc ); + } + } + // TODO extract the loop here to eliminate duplicated code + for (int i = loop_count * unroll_count; i < nb; ++i) { const float * d0_0 = (const float *) (pd0 + i*bs); const float * d1_0 = (const float *) (pd1 + i*bs); @@ -1928,7 +1981,7 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res const size_t bs = 2*sizeof(float) + QK/2; const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); - const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); + const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); for (int i = 0; i < nb; i++) { @@ -5572,7 +5625,7 @@ static void ggml_compute_forward_rms_norm_f32( mean /= ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - + memcpy(y, x, ne00 * sizeof(float)); // for (int i00 = 0; i00 < ne00; i00++) { // y[i00] = x[i00]; @@ -9271,7 +9324,6 @@ struct ggml_compute_state { static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - const int n_threads = state->shared->n_threads; while (true) { @@ -9350,11 +9402,25 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) }; int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + + // pin threads to cpu + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + // TODO this assumes n_threads is the same as n_cpu which is not always true + CPU_SET(j+1, &cpuset); + pthread_setaffinity_np(workers[j].thrd, sizeof(cpu_set_t), &cpuset); + GGML_ASSERT(rc == 0); UNUSED(rc); } } + // set main thread affinity to 0 + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); + // initialize tasks + work buffer { size_t work_size = 0;