Skip to content

try to fix compile warnings on macOS, address issue #2036 #2037

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion ggml.c
Original file line number Diff line number Diff line change
@@ -574,6 +574,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

#if !__ARM_NEON && !__AVX2__ && (__AVX__ || __SSSE3__)
// multiply int8_t, add results pairwise twice
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
// Get absolute values of x vectors
@@ -585,6 +587,7 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
const __m128i ones = _mm_set1_epi16(1);
return _mm_madd_epi16(ones, dot);
}
#endif

#if __AVX__ || __AVX2__ || __AVX512F__
// horizontally add 8 floats
@@ -605,13 +608,15 @@ static inline int hsum_i32_8(const __m256i a) {
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
}

#if !__AVX2__ && __AVX__
// horizontally add 4 int32_t
static inline int hsum_i32_4(const __m128i a) {
const __m128i hi64 = _mm_unpackhi_epi64(a, a);
const __m128i sum64 = _mm_add_epi32(hi64, a);
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
}
#endif

#if defined(__AVX2__) || defined(__AVX512F__)
// spread 32 bits to 32 bytes { 0x00, 0xFF }
@@ -671,6 +676,7 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
#endif
}

/*
static inline __m128i packNibbles( __m256i bytes )
{
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
@@ -691,6 +697,7 @@ static inline __m128i packNibbles( __m256i bytes )
return _mm_packus_epi16( r0, r1 );
#endif
}
*/
#elif defined(__AVX__)
// spread 32 bits to 32 bytes { 0x00, 0xFF }
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
@@ -759,6 +766,7 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
return sum_i16_pairs_float(doth, dotl);
}

/*
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
{
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
@@ -774,6 +782,7 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )

return _mm_packus_epi16( bytes1, bytes2);
}
*/
#endif
#elif defined(__SSSE3__)
// horizontally add 4x4 floats
@@ -1883,7 +1892,7 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
_mm256_extractf128_ps(x[0], 1)); \
const __m128 t1 = _mm_hadd_ps(t0, t0); \
res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
res = (ggml_float)_mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
}
// TODO: is this optimal ?