From b19a21b84c63d990f658678be90f0a3b1663309f Mon Sep 17 00:00:00 2001 From: I <1091761+wx257osn2@users.noreply.github.com> Date: Fri, 26 Jan 2024 17:47:14 +0900 Subject: [PATCH 1/7] expand lambda --- include/qoixx.hpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/include/qoixx.hpp b/include/qoixx.hpp index b4061ac..8f54b88 100644 --- a/include/qoixx.hpp +++ b/include/qoixx.hpp @@ -304,10 +304,13 @@ class qoi{ private: template static inline void encode_body(Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, rgba_t px_prev = {0, 0, 0, 255}, std::uint8_t prev_hash = static_cast(index_size), std::size_t run = 0){ - const auto f = [&run, &index, &p, &prev_hash](rgba_t px, rgba_t px_prev){ + auto px = px_prev; + while(px_len--)[[likely]]{ + px_prev = px; + pull(&px, pixels); if(px == px_prev){ ++run; - return; + continue; } if(run > 0){ while(run >= 62)[[unlikely]]{ @@ -333,7 +336,7 @@ class qoi{ if(index[index_pos] == px){ p.push(chunk_tag::index | index_pos); - return; + continue; } index[index_pos] = px; @@ -341,7 +344,7 @@ class qoi{ if(px.a != px_prev.a){ p.push(chunk_tag::rgba); push<4>(p, &px); - return; + continue; } const auto vr = static_cast(px.r) - static_cast(px_prev.r) + 2; const auto vg = static_cast(px.g) - static_cast(px_prev.g) + 2; @@ -349,7 +352,7 @@ class qoi{ if(const std::uint8_t v = vr|vg|vb; v < 4){ p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb); - return; + continue; } const auto vg_r = vr - vg + 8; const auto vg_b = vb - vg + 8; @@ -361,12 +364,6 @@ class qoi{ p.push(chunk_tag::rgb); push<3>(p, &px); } - }; - auto px = px_prev; - while(px_len--)[[likely]]{ - px_prev = px; - pull(&px, pixels); - f(px, px_prev); } if(px == px_prev){ while(run >= 62)[[unlikely]]{ From 1928f70702b4252505bdff65b0d64a5b4183e4dd Mon Sep 17 00:00:00 2001 From: I <1091761+wx257osn2@users.noreply.github.com> Date: Fri, 26 Jan 2024 17:57:30 +0900 Subject: [PATCH 2/7] stop copying into px_prev if px is 
not changed --- include/qoixx.hpp | 78 +++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/include/qoixx.hpp b/include/qoixx.hpp index 8f54b88..c014297 100644 --- a/include/qoixx.hpp +++ b/include/qoixx.hpp @@ -306,7 +306,6 @@ class qoi{ static inline void encode_body(Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, rgba_t px_prev = {0, 0, 0, 255}, std::uint8_t prev_hash = static_cast(index_size), std::size_t run = 0){ auto px = px_prev; while(px_len--)[[likely]]{ - px_prev = px; pull(&px, pixels); if(px == px_prev){ ++run; @@ -334,47 +333,48 @@ class qoi{ const auto index_pos = px.hash() % index_size; prev_hash = index_pos; - if(index[index_pos] == px){ - p.push(chunk_tag::index | index_pos); - continue; - } - index[index_pos] = px; - - if constexpr(Channels == 4) - if(px.a != px_prev.a){ - p.push(chunk_tag::rgba); - push<4>(p, &px); - continue; + do{ + if(index[index_pos] == px){ + p.push(chunk_tag::index | index_pos); + break; } - const auto vr = static_cast(px.r) - static_cast(px_prev.r) + 2; - const auto vg = static_cast(px.g) - static_cast(px_prev.g) + 2; - const auto vb = static_cast(px.b) - static_cast(px_prev.b) + 2; + index[index_pos] = px; - if(const std::uint8_t v = vr|vg|vb; v < 4){ - p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb); - continue; - } - const auto vg_r = vr - vg + 8; - const auto vg_b = vb - vg + 8; - if(const int v = vg_r|vg_b, g = vg+30; ((v&0xf0)|(g&0xc0)) == 0){ - p.push(chunk_tag::luma | g); - p.push(vg_r << 4 | vg_b); - } - else{ - p.push(chunk_tag::rgb); - push<3>(p, &px); - } + if constexpr(Channels == 4) + if(px.a != px_prev.a){ + p.push(chunk_tag::rgba); + push<4>(p, &px); + break; + } + const auto vr = static_cast(px.r) - static_cast(px_prev.r) + 2; + const auto vg = static_cast(px.g) - static_cast(px_prev.g) + 2; + const auto vb = static_cast(px.b) - static_cast(px_prev.b) + 2; + + if(const std::uint8_t v = vr|vg|vb; v < 4){ + 
p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb); + break; + } + const auto vg_r = vr - vg + 8; + const auto vg_b = vb - vg + 8; + if(const int v = vg_r|vg_b, g = vg+30; ((v&0xf0)|(g&0xc0)) == 0){ + p.push(chunk_tag::luma | g); + p.push(vg_r << 4 | vg_b); + } + else{ + p.push(chunk_tag::rgb); + push<3>(p, &px); + } + }while(false); + px_prev = px; } - if(px == px_prev){ - while(run >= 62)[[unlikely]]{ - static constexpr std::uint8_t x = chunk_tag::run | 61; - p.push(x); - run -= 62; - } - if(run > 0){ - p.push(chunk_tag::run | (run-1)); - run = 0; - } + while(run >= 62)[[unlikely]]{ + static constexpr std::uint8_t x = chunk_tag::run | 61; + p.push(x); + run -= 62; + } + if(run > 0){ + p.push(chunk_tag::run | (run-1)); + run = 0; } } #ifndef QOIXX_NO_SIMD From 0c3dbf6bcf0e5086b4f8e51ef4383312a8b94c31 Mon Sep 17 00:00:00 2001 From: I <1091761+wx257osn2@users.noreply.github.com> Date: Sat, 3 Feb 2024 15:00:39 +0900 Subject: [PATCH 3/7] skip unused writing --- include/qoixx.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/qoixx.hpp b/include/qoixx.hpp index c014297..5a84854 100644 --- a/include/qoixx.hpp +++ b/include/qoixx.hpp @@ -372,10 +372,8 @@ class qoi{ p.push(x); run -= 62; } - if(run > 0){ + if(run > 0) p.push(chunk_tag::run | (run-1)); - run = 0; - } } #ifndef QOIXX_NO_SIMD #if defined(__ARM_FEATURE_SVE) From d0ee4c2ad2bef07a954c32629613c444e5b5d6b4 Mon Sep 17 00:00:00 2001 From: I <1091761+wx257osn2@users.noreply.github.com> Date: Fri, 26 Jan 2024 18:09:28 +0900 Subject: [PATCH 4/7] skip calculates if g is bigger --- include/qoixx.hpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/include/qoixx.hpp b/include/qoixx.hpp index 5a84854..8ba1157 100644 --- a/include/qoixx.hpp +++ b/include/qoixx.hpp @@ -346,24 +346,26 @@ class qoi{ push<4>(p, &px); break; } - const auto vr = static_cast(px.r) - static_cast(px_prev.r) + 2; - const auto vg = static_cast(px.g) - 
static_cast(px_prev.g) + 2; - const auto vb = static_cast(px.b) - static_cast(px_prev.b) + 2; + const auto vg_2 = static_cast(px.g) - static_cast(px_prev.g); + if(const int g = vg_2+32; (g&0xc0) == 0){ + const auto vr = static_cast(px.r) - static_cast(px_prev.r) + 2; + const auto vg = vg_2 + 2; + const auto vb = static_cast(px.b) - static_cast(px_prev.b) + 2; - if(const std::uint8_t v = vr|vg|vb; v < 4){ - p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb); - break; - } - const auto vg_r = vr - vg + 8; - const auto vg_b = vb - vg + 8; - if(const int v = vg_r|vg_b, g = vg+30; ((v&0xf0)|(g&0xc0)) == 0){ - p.push(chunk_tag::luma | g); - p.push(vg_r << 4 | vg_b); - } - else{ - p.push(chunk_tag::rgb); - push<3>(p, &px); + if(const std::uint8_t v = vr|vg|vb; v < 4){ + p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb); + break; + } + const auto vg_r = vr - vg + 8; + const auto vg_b = vb - vg + 8; + if(const int v = vg_r|vg_b; (v&0xf0) == 0){ + p.push(chunk_tag::luma | g); + p.push(vg_r << 4 | vg_b); + break; + } } + p.push(chunk_tag::rgb); + push<3>(p, &px); }while(false); px_prev = px; } From e392487c293548823740426d191cbf1575f4da54 Mon Sep 17 00:00:00 2001 From: I <1091761+wx257osn2@users.noreply.github.com> Date: Fri, 26 Jan 2024 18:10:18 +0900 Subject: [PATCH 5/7] use arithmetic comparison instead of bit operation --- include/qoixx.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/qoixx.hpp b/include/qoixx.hpp index 8ba1157..2f3d416 100644 --- a/include/qoixx.hpp +++ b/include/qoixx.hpp @@ -347,18 +347,18 @@ class qoi{ break; } const auto vg_2 = static_cast(px.g) - static_cast(px_prev.g); - if(const int g = vg_2+32; (g&0xc0) == 0){ + if(const std::uint8_t g = vg_2+32; g < 64){ const auto vr = static_cast(px.r) - static_cast(px_prev.r) + 2; const auto vg = vg_2 + 2; const auto vb = static_cast(px.b) - static_cast(px_prev.b) + 2; - if(const std::uint8_t v = vr|vg|vb; v < 4){ + if(static_cast(vr|vg|vb) < 4){ p.push(chunk_tag::diff | vr << 4 
| vg << 2 | vb); break; } const auto vg_r = vr - vg + 8; const auto vg_b = vb - vg + 8; - if(const int v = vg_r|vg_b; (v&0xf0) == 0){ + if(static_cast(vg_r|vg_b) < 16){ p.push(chunk_tag::luma | g); p.push(vg_r << 4 | vg_b); break; From f434234d84d716c971b58f21885587affde5776b Mon Sep 17 00:00:00 2001 From: I <1091761+wx257osn2@users.noreply.github.com> Date: Fri, 26 Jan 2024 18:17:36 +0900 Subject: [PATCH 6/7] write chunk_tag::rgb and the pixel data at once --- include/qoixx.hpp | 79 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/include/qoixx.hpp b/include/qoixx.hpp index 2f3d416..a4a5269 100644 --- a/include/qoixx.hpp +++ b/include/qoixx.hpp @@ -248,6 +248,19 @@ class qoi{ }; struct rgb_t{ std::uint8_t r, g, b; + inline std::uint32_t v()const{ + static_assert(sizeof(rgb_t) == 3u); + if constexpr(std::endian::native == std::endian::little){ + std::uint32_t x = 255u << 24u; + efficient_memcpy<3>(&x, this); + return x; + } + else + return std::uint32_t{r} | + std::uint32_t{g} << 8 | + std::uint32_t{b} << 16 | + 255u << 24; + } inline std::uint_fast32_t hash()const{ static constexpr std::uint64_t constant = static_cast(3u) << 56 | @@ -302,12 +315,27 @@ class qoi{ } } private: + template + using local_rgba_pixel_t = std::conditional_t; + template + static constexpr local_rgba_pixel_t default_pixel()noexcept{ + if constexpr(Alpha) + return {0, 0, 0, 255}; + else + return {}; + } + template + struct local_pixel{ + std::uint8_t rgb = static_cast(chunk_tag::rgb); + local_rgba_pixel_t v; + }; + static_assert(std::has_unique_object_representations_v> and std::has_unique_object_representations_v>); template - static inline void encode_body(Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, rgba_t px_prev = {0, 0, 0, 255}, std::uint8_t prev_hash = static_cast(index_size), std::size_t run = 0){ - auto px = px_prev; + static inline void encode_body(Pusher& p, Puller& pixels, rgba_t 
(&index)[index_size], std::size_t px_len, local_rgba_pixel_t px_prev = default_pixel(), std::uint8_t prev_hash = static_cast(index_size), std::size_t run = 0){ + local_pixel px; while(px_len--)[[likely]]{ - pull(&px, pixels); - if(px == px_prev){ + pull(&px.v, pixels); + if(px.v.v() == px_prev.v()){ ++run; continue; } @@ -330,27 +358,29 @@ class qoi{ } } - const auto index_pos = px.hash() % index_size; + const auto index_pos = px.v.hash() % index_size; prev_hash = index_pos; do{ - if(index[index_pos] == px){ + if(index[index_pos].v() == px.v.v()){ p.push(chunk_tag::index | index_pos); break; } - index[index_pos] = px; + efficient_memcpy(index + index_pos, &px.v); + if constexpr(Channels == 3) + index[index_pos].a = 255u; if constexpr(Channels == 4) - if(px.a != px_prev.a){ + if(px.v.a != px_prev.a){ p.push(chunk_tag::rgba); - push<4>(p, &px); + push<4>(p, &px.v); break; } - const auto vg_2 = static_cast(px.g) - static_cast(px_prev.g); + const auto vg_2 = static_cast(px.v.g) - static_cast(px_prev.g); if(const std::uint8_t g = vg_2+32; g < 64){ - const auto vr = static_cast(px.r) - static_cast(px_prev.r) + 2; + const auto vr = static_cast(px.v.r) - static_cast(px_prev.r) + 2; const auto vg = vg_2 + 2; - const auto vb = static_cast(px.b) - static_cast(px_prev.b) + 2; + const auto vb = static_cast(px.v.b) - static_cast(px_prev.b) + 2; if(static_cast(vr|vg|vb) < 4){ p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb); @@ -361,13 +391,14 @@ class qoi{ if(static_cast(vg_r|vg_b) < 16){ p.push(chunk_tag::luma | g); p.push(vg_r << 4 | vg_b); - break; } + else + push<4>(p, &px); } - p.push(chunk_tag::rgb); - push<3>(p, &px); + else + push<4>(p, &px); }while(false); - px_prev = px; + efficient_memcpy(&px_prev, &px.v); } while(run >= 62)[[unlikely]]{ static constexpr std::uint8_t x = chunk_tag::run | 61; @@ -712,7 +743,13 @@ class qoi{ } p_.advance(p-p_.raw_pointer()); - encode_body(p_, pixels_, index, px_len, px, prev_hash, run); + if constexpr(Alpha) + encode_body(p_, pixels_, 
index, px_len, px, prev_hash, run); + else{ + rgb_t px_prev; + efficient_memcpy<3>(&px_prev, &px); + encode_body(p_, pixels_, index, px_len, px_prev, prev_hash, run); + } push(p_, padding); } @@ -984,7 +1021,13 @@ class qoi{ } p_.advance(p-p_.raw_pointer()); - encode_body(p_, pixels_, index, px_len, px, prev_hash, run); + if constexpr(Alpha) + encode_body(p_, pixels_, index, px_len, px, prev_hash, run); + else{ + rgb_t px_prev; + efficient_memcpy<3>(&px_prev, &px); + encode_body(p_, pixels_, index, px_len, px_prev, prev_hash, run); + } push(p_, padding); } From f381f6d462227b1f7e2c6f4fce60876543596e0a Mon Sep 17 00:00:00 2001 From: I <1091761+wx257osn2@users.noreply.github.com> Date: Sat, 3 Feb 2024 13:32:56 +0900 Subject: [PATCH 7/7] deinterleave luma data --- include/qoixx.hpp | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/include/qoixx.hpp b/include/qoixx.hpp index a4a5269..54c4871 100644 --- a/include/qoixx.hpp +++ b/include/qoixx.hpp @@ -516,12 +516,12 @@ class qoi{ hash = svand_n_u8_x(mask, svadd_u8_x(mask, svadd_u8_x(mask, svmul_n_u8_x(mask, get<0>(pxs), 3), svmul_n_u8_x(mask, get<1>(pxs), 5)), svadd_u8_x(mask, svmul_n_u8_x(mask, get<2>(pxs), 7), svmul_n_u8_x(mask, get<3>(pxs), 11))), 63); else hash = svand_n_u8_x(mask, svadd_u8_x(mask, svadd_u8_x(mask, svmul_n_u8_x(mask, get<0>(pxs), 3), svmul_n_u8_x(mask, get<1>(pxs), 5)), svadd_n_u8_x(mask, svmul_n_u8_x(mask, get<2>(pxs), 7), static_cast(255*11))), 63); - std::uint8_t runs[SVERegisterSize/8], diffs[SVERegisterSize/8], lus[SVERegisterSize/8], mas[SVERegisterSize/8], hashs[SVERegisterSize/8]; + std::uint8_t runs[SVERegisterSize/8], diffs[SVERegisterSize/8], lumas[SVERegisterSize/8*2], hashs[SVERegisterSize/8]; [[maybe_unused]] std::uint8_t alphas[SVERegisterSize/8]; svst1_u8(mask, runs, svadd_n_u8_m(runv, zero, 1)); svst1_u8(mask, diffs, diffv); - svst1_u8(mask, lus, lu); - svst1_u8(mask, mas, ma); + const auto luma = svcreate2_u8(lu, ma); + 
svst2_u8(mask, lumas, luma); svst1_u8(mask, hashs, hash); if constexpr(Alpha) if(!alpha) @@ -562,9 +562,9 @@ class qoi{ } if(diffs[i]) *p++ = diffs[i]; - else if(lus[i]){ - *p++ = lus[i]; - *p++ = mas[i]; + else if(lumas[i*2]){ + std::memcpy(p, lumas + i*2, 2); + p += 2; } else{ *p++ = chunk_tag::rgb; @@ -683,12 +683,11 @@ class qoi{ hash = vandq_u8(vaddq_u8(vaddq_u8(vmulq_u8(pxs.val[0], vdupq_n_u8(3)), vmulq_u8(pxs.val[1], vdupq_n_u8(5))), vaddq_u8(vmulq_u8(pxs.val[2], vdupq_n_u8(7)), vmulq_u8(pxs.val[3], vdupq_n_u8(11)))), vdupq_n_u8(63)); else hash = vandq_u8(vaddq_u8(vaddq_u8(vmulq_u8(pxs.val[0], vdupq_n_u8(3)), vmulq_u8(pxs.val[1], vdupq_n_u8(5))), vaddq_u8(vmulq_u8(pxs.val[2], vdupq_n_u8(7)), vdupq_n_u8(static_cast(255*11)))), vdupq_n_u8(63)); - std::uint8_t runs[simd_lanes], diffs[simd_lanes], lus[simd_lanes], mas[simd_lanes], hashs[simd_lanes]; + std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas[simd_lanes*2], hashs[simd_lanes]; [[maybe_unused]] std::uint8_t alphas[simd_lanes]; vst1q_u8(runs, runv); vst1q_u8(diffs, diffv); - vst1q_u8(lus, lu); - vst1q_u8(mas, ma); + vst2q_u8(lumas, (uint8x16x2_t{lu, ma})); vst1q_u8(hashs, hash); if constexpr(Alpha) if(!alpha) @@ -729,9 +728,9 @@ class qoi{ } if(diffs[i]) *p++ = diffs[i]; - else if(lus[i]){ - *p++ = lus[i]; - *p++ = mas[i]; + else if(lumas[i*2]){ + std::memcpy(p, lumas + i*2, 2); + p += 2; } else{ *p++ = chunk_tag::rgb; @@ -954,19 +953,20 @@ class qoi{ diff.val[0] = _mm256_add_epi8(_mm256_sub_epi8(diff.val[0], diff.val[1]), eight); diff.val[2] = _mm256_add_epi8(_mm256_sub_epi8(diff.val[2], diff.val[1]), eight); diff.val[1] = _mm256_add_epi8(diff.val[1], _mm256_set1_epi8(30)); - const auto lu = _mm256_and_si256(_mm256_or_si256(_mm256_set1_epi8(static_cast(chunk_tag::luma)), diff.val[1]), _mm256_cmpeq_epi8(_mm256_or_si256(_mm256_and_si256(_mm256_or_si256(diff.val[0], diff.val[2]), _mm256_set1_epi8(static_cast(0xf0))), _mm256_and_si256(diff.val[1], _mm256_set1_epi8(static_cast(0xc0)))), zero)); - const 
auto ma = _mm256_or_si256(slli_epi8<4>(diff.val[0]), diff.val[2]); + const auto luma_mask = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); + const auto lu = _mm256_permutevar8x32_epi32(_mm256_and_si256(_mm256_or_si256(_mm256_set1_epi8(static_cast(chunk_tag::luma)), diff.val[1]), _mm256_cmpeq_epi8(_mm256_or_si256(_mm256_and_si256(_mm256_or_si256(diff.val[0], diff.val[2]), _mm256_set1_epi8(static_cast(0xf0))), _mm256_and_si256(diff.val[1], _mm256_set1_epi8(static_cast(0xc0)))), zero)), luma_mask); + const auto ma = _mm256_permutevar8x32_epi32(_mm256_or_si256(slli_epi8<4>(diff.val[0]), diff.val[2]), luma_mask); __m256i hash; if constexpr(Alpha) hash = _mm256_and_si256(_mm256_add_epi8(_mm256_add_epi8(mul_epi8<3>(pxs.val[0]), mul_epi8<5>(pxs.val[1])), _mm256_add_epi8(mul_epi8<7>(pxs.val[2]), mul_epi8<11>(pxs.val[3]))), _mm256_set1_epi8(63)); else hash = _mm256_and_si256(_mm256_add_epi8(_mm256_add_epi8(mul_epi8<3>(pxs.val[0]), mul_epi8<5>(pxs.val[1])), _mm256_add_epi8(mul_epi8<7>(pxs.val[2]), _mm256_set1_epi8(static_cast(255*11)))), _mm256_set1_epi8(63)); - alignas(alignof(__m256i)) std::uint8_t runs[simd_lanes], diffs[simd_lanes], lus[simd_lanes], mas[simd_lanes], hashs[simd_lanes]; + alignas(alignof(__m256i)) std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas[simd_lanes*2], hashs[simd_lanes]; [[maybe_unused]] alignas(alignof(__m256i)) std::uint8_t alphas[simd_lanes]; _mm256_store_si256(reinterpret_cast<__m256i*>(runs), runv); _mm256_store_si256(reinterpret_cast<__m256i*>(diffs), diffv); - _mm256_store_si256(reinterpret_cast<__m256i*>(lus), lu); - _mm256_store_si256(reinterpret_cast<__m256i*>(mas), ma); + _mm256_store_si256(reinterpret_cast<__m256i*>(lumas), _mm256_unpacklo_epi8(lu, ma)); + _mm256_store_si256(reinterpret_cast<__m256i*>(lumas)+1, _mm256_unpackhi_epi8(lu, ma)); _mm256_store_si256(reinterpret_cast<__m256i*>(hashs), hash); if constexpr(Alpha) if(!alpha) @@ -1007,9 +1007,9 @@ class qoi{ } if(diffs[i]) *p++ = diffs[i]; - else if(lus[i]){ - *p++ = lus[i]; - 
*p++ = mas[i]; + else if(lumas[i*2]){ + std::memcpy(p, lumas + i*2, 2); + p += 2; } else{ *p++ = chunk_tag::rgb;