From ceae03c17b4095a7612dbbc7be864ca9ff0f872f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ferenc=20Pint=C3=A9r?= Date: Sat, 29 Jan 2022 10:10:05 +0100 Subject: [PATCH] Overlay blend: rewrite. 32bit float calculation inside (#255) --- avs_core/filters/overlay/OF_blend.cpp | 210 +++++--- avs_core/filters/overlay/blend_common.cpp | 163 ++++-- avs_core/filters/overlay/blend_common.h | 36 +- .../overlay/intel/blend_common_avx2.cpp | 230 +++++++++ .../filters/overlay/intel/blend_common_avx2.h | 40 ++ .../overlay/intel/blend_common_sse.cpp | 470 +++++++++++++----- .../filters/overlay/intel/blend_common_sse.h | 51 +- avs_core/filters/overlay/overlay.cpp | 7 +- avs_core/filters/overlay/overlayfunctions.h | 3 +- 9 files changed, 935 insertions(+), 275 deletions(-) create mode 100644 avs_core/filters/overlay/intel/blend_common_avx2.cpp create mode 100644 avs_core/filters/overlay/intel/blend_common_avx2.h diff --git a/avs_core/filters/overlay/OF_blend.cpp b/avs_core/filters/overlay/OF_blend.cpp index db157c7fe5..e469168394 100644 --- a/avs_core/filters/overlay/OF_blend.cpp +++ b/avs_core/filters/overlay/OF_blend.cpp @@ -38,6 +38,7 @@ #include "blend_common.h" #ifdef INTEL_INTRINSICS #include "intel/blend_common_sse.h" +#include "intel/blend_common_avx2.h" #endif @@ -72,7 +73,7 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter int planeindex_from = 0; int planeindex_to = 0; - if (of_mode == OF_Blend) { + if (of_mode == OF_Blend || of_mode == OF_Blend_Compat) { planeindex_from = 0; planeindex_to = greyscale ? 0 : 2; } @@ -87,14 +88,63 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter planeindex_to = 2; } - if ((opacity == 256 && pixelsize != 4) || (opacity_f == 1.0f && pixelsize == 4)) { - overlay_blend_plane_masked_t* blend_fn = nullptr; + overlay_blend_plane_masked_opacity_t* blend_fn = nullptr; + if (of_mode != OF_Blend_Compat || pixelsize == 4) { + // independent from full/not full opacity #ifdef INTEL_INTRINSICS - if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) { - blend_fn = overlay_blend_sse2_plane_masked_float; + if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_AVX2)) { + blend_fn = overlay_blend_avx2_float; } - else if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) { + else if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) { + blend_fn = overlay_blend_sse2_float; + } + else if (env->GetCPUFlags() & CPUF_AVX2) { + switch (bits_per_pixel) { + case 8: blend_fn = overlay_blend_avx2_uint; break; + case 10: blend_fn = overlay_blend_avx2_uint; break; + case 12: blend_fn = overlay_blend_avx2_uint; break; + case 14: blend_fn = overlay_blend_avx2_uint; break; + case 16: blend_fn = overlay_blend_avx2_uint; break; + } + } + else if (env->GetCPUFlags() & CPUF_SSE4_1) { + switch (bits_per_pixel) { + case 8: blend_fn = overlay_blend_sse41_uint; break; + case 10: blend_fn = overlay_blend_sse41_uint; break; + case 12: blend_fn = overlay_blend_sse41_uint; break; + case 14: blend_fn = overlay_blend_sse41_uint; break; + case 16: blend_fn = overlay_blend_sse41_uint; break; + } + } + else if (env->GetCPUFlags() & CPUF_SSE2) { + switch (bits_per_pixel) { + case 8: blend_fn = overlay_blend_sse2_uint; break; + case 10: blend_fn = overlay_blend_sse2_uint; break; + case 12: blend_fn = overlay_blend_sse2_uint; break; + case 14: blend_fn = overlay_blend_sse2_uint; break; + case 16: blend_fn = overlay_blend_sse2_uint; break; + } + } + else +#endif // INTEL_INTRINSICS + { + // pure C + switch (bits_per_pixel) { + case 8: blend_fn = overlay_blend_c_uint; break; + case 10: blend_fn = overlay_blend_c_uint; break; + case 12: blend_fn = overlay_blend_c_uint; break; + case 14: blend_fn = overlay_blend_c_uint; break; + case 16: blend_fn = overlay_blend_c_uint; break; + case 32: blend_fn = overlay_blend_c_float; break; + } + } + // end of new, float precision inside masked overlays + } + else if (opacity == 256) { + // specialized functions for full opacity +#ifdef INTEL_INTRINSICS + if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) { switch (bits_per_pixel) { case 10: blend_fn = overlay_blend_sse41_plane_masked; break; case 12: blend_fn = overlay_blend_sse41_plane_masked; break; @@ -102,9 +152,6 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter case 16: blend_fn = overlay_blend_sse41_plane_masked; break; } } - else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE4_1)) { - blend_fn = overlay_blend_sse41_plane_masked; - } else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) { blend_fn = overlay_blend_sse2_plane_masked; } @@ -117,34 +164,21 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter #endif #endif // INTEL_INTRINSICS { + // pure C switch (bits_per_pixel) { case 8: blend_fn = overlay_blend_c_plane_masked; break; case 10: blend_fn = overlay_blend_c_plane_masked; break; case 12: blend_fn = overlay_blend_c_plane_masked; break; case 14: blend_fn = overlay_blend_c_plane_masked; break; case 16: blend_fn = overlay_blend_c_plane_masked; break; - case 32: blend_fn = overlay_blend_c_plane_masked_f; break; } - } - if (blend_fn == nullptr) - env->ThrowError("Blend: no valid internal function"); - - for (int p = planeindex_from; p <= planeindex_to; p++) { - blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), mask->GetPtrByIndex(p), - base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), mask->GetPitchByIndex(p), - (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p]); - } } else { - overlay_blend_plane_masked_opacity_t* blend_fn = nullptr; - + // specialized functions for non-full opacity #ifdef INTEL_INTRINSICS - if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) { - blend_fn = overlay_blend_sse2_plane_masked_opacity_float; - } - else if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) { + if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) { switch (bits_per_pixel) { case 10: blend_fn = overlay_blend_sse41_plane_masked_opacity; break; @@ -153,9 +187,6 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter case 16: blend_fn = overlay_blend_sse41_plane_masked_opacity; break; } } - else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE4_1)) { - blend_fn = overlay_blend_sse41_plane_masked_opacity; - } else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) { blend_fn = overlay_blend_sse2_plane_masked_opacity; } @@ -174,18 +205,17 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter case 12:blend_fn = overlay_blend_c_plane_masked_opacity; break; case 14:blend_fn = overlay_blend_c_plane_masked_opacity; break; case 16:blend_fn = overlay_blend_c_plane_masked_opacity; break; - case 32: blend_fn = overlay_blend_c_plane_masked_opacity_f; break; } } + } - if (blend_fn == nullptr) - env->ThrowError("Blend: no valid internal function"); + if (blend_fn == nullptr) + env->ThrowError("Blend: no valid internal function"); - for (int p = planeindex_from; p <= planeindex_to; p++) { - blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), mask->GetPtrByIndex(p), - base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), mask->GetPitchByIndex(p), - (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f); - } + for (int p = planeindex_from; p <= planeindex_to; p++) { + blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), mask->GetPtrByIndex(p), + base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), mask->GetPitchByIndex(p), + (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f); } } @@ -199,7 +229,7 @@ void OL_BlendImage::BlendImage(ImageOverlayInternal* base, ImageOverlayInternal* int planeindex_from = 0; int planeindex_to = 0; - if (of_mode == OF_Blend) { + if (of_mode == OF_Blend || of_mode == OF_Blend_Compat) { planeindex_from = 0; planeindex_to = greyscale ? 0 : 2; } @@ -220,46 +250,100 @@ void OL_BlendImage::BlendImage(ImageOverlayInternal* base, ImageOverlayInternal* } } else { - overlay_blend_plane_opacity_t* blend_fn = nullptr; + overlay_blend_plane_masked_opacity_t* blend_fn = nullptr; + + if (of_mode != OF_Blend_Compat || pixelsize == 4) { #ifdef INTEL_INTRINSICS - if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) { - blend_fn = overlay_blend_sse2_plane_opacity_float; - } - else if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) { - switch (bits_per_pixel) { - case 10: blend_fn = overlay_blend_sse41_plane_opacity_uint16<10>; break; - case 12: blend_fn = overlay_blend_sse41_plane_opacity_uint16<12>; break; - case 14: blend_fn = overlay_blend_sse41_plane_opacity_uint16<14>; break; - case 16: blend_fn = overlay_blend_sse41_plane_opacity_uint16<16>; break; + if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_AVX2)) { + blend_fn = overlay_blend_avx2_float; } - } - else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) { - blend_fn = overlay_blend_sse2_plane_opacity; - } - else -#ifdef X86_32 - if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_MMX)) { - blend_fn = overlay_blend_mmx_plane_opacity; + else if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) { + blend_fn = overlay_blend_sse2_float; + } + else if (env->GetCPUFlags() & CPUF_AVX2) { + switch (bits_per_pixel) { + case 8: blend_fn = overlay_blend_avx2_uint; break; + case 10: blend_fn = overlay_blend_avx2_uint; break; + case 12: blend_fn = overlay_blend_avx2_uint; break; + case 14: blend_fn = overlay_blend_avx2_uint; break; + case 16: blend_fn = overlay_blend_avx2_uint; break; + } + } + else if (env->GetCPUFlags() & CPUF_SSE4_1) { + switch (bits_per_pixel) { + case 8: blend_fn = overlay_blend_sse41_uint; break; + case 10: blend_fn = overlay_blend_sse41_uint; break; + case 12: blend_fn = overlay_blend_sse41_uint; break; + case 14: blend_fn = overlay_blend_sse41_uint; break; + case 16: blend_fn = overlay_blend_sse41_uint; break; + } + } + else if (env->GetCPUFlags() & CPUF_SSE2) { + switch (bits_per_pixel) { + case 8: blend_fn = overlay_blend_sse2_uint; break; + case 10: blend_fn = overlay_blend_sse2_uint; break; + case 12: blend_fn = overlay_blend_sse2_uint; break; + case 14: blend_fn = overlay_blend_sse2_uint; break; + case 16: blend_fn = overlay_blend_sse2_uint; break; + } } else -#endif #endif // INTEL_INTRINSICS { + // pure C switch (bits_per_pixel) { - case 8: blend_fn = overlay_blend_c_plane_opacity; break; - case 10: blend_fn = overlay_blend_c_plane_opacity; break; - case 12: blend_fn = overlay_blend_c_plane_opacity; break; - case 14: blend_fn = overlay_blend_c_plane_opacity; break; - case 16: blend_fn = overlay_blend_c_plane_opacity; break; - case 32: blend_fn = overlay_blend_c_plane_opacity_f; break; + case 8: blend_fn = overlay_blend_c_uint; break; + case 10: blend_fn = overlay_blend_c_uint; break; + case 12: blend_fn = overlay_blend_c_uint; break; + case 14: blend_fn = overlay_blend_c_uint; break; + case 16: blend_fn = overlay_blend_c_uint; break; + case 32: blend_fn = overlay_blend_c_float; break; } } + // end of new, float precision inside masked overlays + } + else { + // old routies +#ifdef INTEL_INTRINSICS + if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) { + switch (bits_per_pixel) { + case 10: blend_fn = overlay_blend_sse41_plane_opacity_uint16<10>; break; + case 12: blend_fn = overlay_blend_sse41_plane_opacity_uint16<12>; break; + case 14: blend_fn = overlay_blend_sse41_plane_opacity_uint16<14>; break; + case 16: blend_fn = overlay_blend_sse41_plane_opacity_uint16<16>; break; + } + } + else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) { + blend_fn = overlay_blend_sse2_plane_opacity; + } + else +#ifdef X86_32 + if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_MMX)) { + blend_fn = overlay_blend_mmx_plane_opacity; + } + else +#endif +#endif // INTEL_INTRINSICS + { + switch (bits_per_pixel) { + case 8: blend_fn = overlay_blend_c_plane_opacity; break; + case 10: blend_fn = overlay_blend_c_plane_opacity; break; + case 12: blend_fn = overlay_blend_c_plane_opacity; break; + case 14: blend_fn = overlay_blend_c_plane_opacity; break; + case 16: blend_fn = overlay_blend_c_plane_opacity; break; + } + } + } if (blend_fn == nullptr) env->ThrowError("Blend: no valid internal function"); for (int p = planeindex_from; p <= planeindex_to; p++) { - blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f); + // no mask ptr + blend_fn( + base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), nullptr, + base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), 0, + (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f); } } diff --git a/avs_core/filters/overlay/blend_common.cpp b/avs_core/filters/overlay/blend_common.cpp index ddb27fbb78..05baf6f6dd 100644 --- a/avs_core/filters/overlay/blend_common.cpp +++ b/avs_core/filters/overlay/blend_common.cpp @@ -47,43 +47,123 @@ ********* Mode: Blend ******** ******************************/ -template -void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, - const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height) +// 32 bit float mask calculation inside +template +void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f) { + const int max_pixel_value = (1 << bits_per_pixel) - 1; + auto factor = 1.0f / max_pixel_value; + factor = factor * opacity_f; + for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { - int new_mask = reinterpret_cast(mask)[x]; - pixel_t p1x = reinterpret_cast(p1)[x]; - pixel_t p2x = reinterpret_cast(p2)[x]; - pixel_t result; - if constexpr(bits_per_pixel == 8) - result = (pixel_t)overlay_blend_c_core_8((BYTE)p1x, (BYTE)p2x, new_mask); - else - result = (pixel_t)overlay_blend_c_core_16((uint16_t)p1x, (uint16_t)p2x, new_mask); - reinterpret_cast(p1)[x] = result; + const float new_mask = has_mask ? (float)reinterpret_cast(mask)[x] * factor : factor; + auto result = overlay_blend_c_core_simple( + reinterpret_cast(p1)[x], + reinterpret_cast(p2)[x], + new_mask); + reinterpret_cast(p1)[x] = (pixel_t)(result + 0.5f); } - p1 += p1_pitch; - p2 += p2_pitch; - mask += mask_pitch; + p1 += p1_pitch; + p2 += p2_pitch; + if(has_mask) + mask += mask_pitch; } } -void overlay_blend_c_plane_masked_f(BYTE *p1, const BYTE *p2, const BYTE *mask, +// instantiate +// w/o mask +template void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +// w/ mask +template void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); + +void overlay_blend_c_plane_masked_f(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height) { + const int width, const int height, const int /*opacity*/, const float /*opacity_f*/) { typedef float pixel_t; for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { - pixel_t new_mask = reinterpret_cast(mask)[x]; - pixel_t p1x = reinterpret_cast(p1)[x]; - pixel_t p2x = reinterpret_cast(p2)[x]; - pixel_t result = p1x + (p2x-p1x)*new_mask; // p1x*(1-new_mask) + p2x*mask + pixel_t new_mask = reinterpret_cast(mask)[x]; + pixel_t p1x = reinterpret_cast(p1)[x]; + pixel_t p2x = reinterpret_cast(p2)[x]; + pixel_t result = p1x + (p2x - p1x) * new_mask; // p1x*(1-new_mask) + p2x*mask //pixel_t result = overlay_blend_c_core(reinterpret_cast(p1)[x], reinterpret_cast(p2)[x], static_cast(reinterpret_cast(mask)[x])); + reinterpret_cast(p1)[x] = result; + } + + p1 += p1_pitch; + p2 += p2_pitch; + mask += mask_pitch; + } +} + +template +void overlay_blend_c_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int /*opacity*/, const float opacity_f) { + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + auto new_mask = has_mask ? reinterpret_cast(mask)[x] * opacity_f : opacity_f; + auto p1x = reinterpret_cast(p1)[x]; + auto p2x = reinterpret_cast(p2)[x]; + auto result = p1x + (p2x - p1x) * new_mask; // p1x*(1-new_mask) + p2x*mask + reinterpret_cast(p1)[x] = result; + } + + p1 += p1_pitch; + p2 += p2_pitch; + if constexpr (has_mask) + mask += mask_pitch; + } +} + +// instantiate +template void overlay_blend_c_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int /*opacity*/, const float opacity_f); +template void overlay_blend_c_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int /*opacity*/, const float opacity_f); + + + +template +void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int /*opacity*/, const float /*opacity_f*/) +{ + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + int new_mask = reinterpret_cast(mask)[x]; + pixel_t p1x = reinterpret_cast(p1)[x]; + pixel_t p2x = reinterpret_cast(p2)[x]; + pixel_t result; + if constexpr(bits_per_pixel == 8) + result = (pixel_t)overlay_blend_c_core_8((BYTE)p1x, (BYTE)p2x, new_mask); + else + result = (pixel_t)overlay_blend_c_core_16((uint16_t)p1x, (uint16_t)p2x, new_mask); reinterpret_cast(p1)[x] = result; } @@ -93,27 +173,28 @@ void overlay_blend_c_plane_masked_f(BYTE *p1, const BYTE *p2, const BYTE *mask, } } + // instantiate template void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height); + const int width, const int height, const int opacity, const float opacity_f); template void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height); + const int width, const int heigh, const int opacity, const float opacity_ft); template void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height); + const int width, const int height, const int opacity, const float opacity_f); template void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height); + const int width, const int height, const int opacity, const float opacity_f); template void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height); + const int width, const int height, const int opacity, const float opacity_f); template -void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* /*mask*/, + const int p1_pitch, const int p2_pitch, const int /*mask_pitch*/, const int width, const int height, const int opacity, const float opacity_f) { AVS_UNUSED(opacity_f); @@ -140,8 +221,8 @@ void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, } } -void overlay_blend_c_plane_opacity_f(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +void overlay_blend_c_plane_opacity_f(BYTE *p1, const BYTE *p2, const BYTE* /*mask*/, + const int p1_pitch, const int p2_pitch, const int /*mask_pitch*/, const int width, const int height,const int opacity, const float opacity_f) { AVS_UNUSED(opacity); @@ -160,20 +241,20 @@ void overlay_blend_c_plane_opacity_f(BYTE *p1, const BYTE *p2, } // instantiate -template void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +template void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); -template void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +template void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); -template void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +template void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); -template void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +template void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); -template void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +template void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); diff --git a/avs_core/filters/overlay/blend_common.h b/avs_core/filters/overlay/blend_common.h index ed95a0131f..258276036f 100644 --- a/avs_core/filters/overlay/blend_common.h +++ b/avs_core/filters/overlay/blend_common.h @@ -40,14 +40,6 @@ #include #include -using overlay_blend_plane_masked_t = void(BYTE* p1, const BYTE* p2, const BYTE* mask, - const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height); - -using overlay_blend_plane_opacity_t = void(BYTE* p1, const BYTE* p2, - const int p1_pitch, const int p2_pitch, - const int width, const int height, const int opacity, const float opacity_f); - using overlay_blend_plane_masked_opacity_t = void(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); @@ -79,6 +71,12 @@ AVS_FORCEINLINE static uint16_t overlay_blend_c_core_16(const uint16_t p1, const return (uint16_t)(((p1 << bits_per_pixel) + (p2 - p1)*mask + half_rounder) >> bits_per_pixel); } +AVS_FORCEINLINE static float overlay_blend_c_core_simple(const int p1, const int p2, const float factor) { + // p1*(1-mask_f) + p2*mask_f -> p1 + (p2-p1)*mask_f + const float res = p1 + (p2 - p1) * factor; + return res; +} + AVS_FORCEINLINE static float overlay_blend_c_core_f(const float p1, const float p2, const float mask) { return p1 + (p2-p1)*mask; // p1*(1-mask) + p2*mask } @@ -103,19 +101,29 @@ AVS_FORCEINLINE pixel_t overlay_blend_opaque_c_core(const pixel_t p1, const pixe // Mode: Overlay void overlay_blend_c_plane_masked_f(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height); + const int width, const int height, const int opacity, const float opacity_f); template void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height); + const int width, const int height, const int opacity, const float opacity_f); + +template +void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f); + +template +void overlay_blend_c_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f); template -void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); -void overlay_blend_c_plane_opacity_f(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +void overlay_blend_c_plane_opacity_f(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); template diff --git a/avs_core/filters/overlay/intel/blend_common_avx2.cpp b/avs_core/filters/overlay/intel/blend_common_avx2.cpp new file mode 100644 index 0000000000..1d88c735b9 --- /dev/null +++ b/avs_core/filters/overlay/intel/blend_common_avx2.cpp @@ -0,0 +1,230 @@ +// Avisynth+ +// https://avs-plus.net +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit +// http://www.gnu.org/copyleft/gpl.html . +// +// Linking Avisynth statically or dynamically with other modules is making a +// combined work based on Avisynth. Thus, the terms and conditions of the GNU +// General Public License cover the whole combination. +// +// As a special exception, the copyright holders of Avisynth give you +// permission to link Avisynth with independent modules that communicate with +// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license +// terms of these independent modules, and to copy and distribute the +// resulting combined work under terms of your choice, provided that +// every copy of the combined work is accompanied by a complete copy of +// the source code of Avisynth (the version of Avisynth used to produce the +// combined work), being distributed under the terms of the GNU General +// Public License plus this exception. An independent module is a module +// which is not derived from or based on Avisynth, such as 3rd-party filters, +// import and export plugins, or graphical user interfaces. + +#include "avisynth.h" +#include "blend_common_avx2.h" +#include "../blend_common.h" + +#include + +#ifdef AVS_WINDOWS +#include +#else +#include +#endif + +template +static AVS_FORCEINLINE __m256 Eightpixels_to_floats(const pixel_t* src) { + __m256i srci; + if constexpr (sizeof(pixel_t) == 1) { + srci = _mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(src))); + } + else { + srci = _mm256_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast(src))); + } + return _mm256_cvtepi32_ps(srci); +} + +template +static AVS_FORCEINLINE void Store_Eightpixels(pixel_t* dst, __m256 what, const __m256 rounder) { + what = _mm256_add_ps(what, rounder); // round + __m256i si32 = _mm256_cvttps_epi32(what); // truncate + __m256i result = _mm256_packus_epi32(si32, si32); // only low 8 words needed + result = _mm256_permute4x64_epi64(result, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6)); + __m128i result128 = _mm256_castsi256_si128(result); + if constexpr (sizeof(pixel_t) == 1) { + __m128i result64 = _mm_packus_epi16(result128, result128); + _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), result64); + } else { + /* when mask is 0..1 checked then this is not possible + if constexpr (bits_per_pixel < 16) { // otherwise no clamp needed + constexpr int max_pixel_value = (1 << bits_per_pixel) - 1; + auto max_pixel_value_v = _mm_set1_epi16(static_cast(max_pixel_value)); + result128 = _mm_min_epu16(result128, max_pixel_value_v); + } + */ + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), result128); + } +} + +AVS_FORCEINLINE static __m256 overlay_blend_avx2_core_new(const __m256& p1_f, const __m256& p2_f, const __m256& factor) { + /* + // p1*(1-mask_f) + p2*mask_f -> p1 + (p2-p1)*mask_f + constexpr int max_pixel_value = (1 << bits_per_pixel) - 1; + constexpr float factor = 1.0f / max_pixel_value; + constexpr float half_rounder = 0.5f; + const float mask_f = mask * factor; + const float res = p1 + (p2 - p1) * mask_f; + int result = (int)(res + 0.5f); + */ + // rounding not here, but before storage + auto res = _mm256_add_ps(p1_f, _mm256_mul_ps(_mm256_sub_ps(p2_f, p1_f), factor)); + return res; +} + +template +void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f) +{ + + auto rounder = _mm256_set1_ps(0.5f); + const int max_pixel_value = (1 << bits_per_pixel) - 1; + auto factor = 1.0f / max_pixel_value; + factor = factor * opacity_f; + auto factor_v = _mm256_set1_ps(factor); + + const int realwidth = width * sizeof(pixel_t); + + // 2x8 pixels at a time + constexpr int bytes_per_cycle = 16 * sizeof(pixel_t); + int wMod16 = (realwidth / bytes_per_cycle) * bytes_per_cycle; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < wMod16; x += bytes_per_cycle) { + auto unpacked_p1 = Eightpixels_to_floats((const pixel_t*)(p1 + x)); // 8x32 + auto unpacked_p2 = Eightpixels_to_floats((const pixel_t*)(p2 + x)); // 8x32 + + auto unpacked_p1_2 = Eightpixels_to_floats((const pixel_t*)(p1 + x + bytes_per_cycle / 2)); // 8x32 + auto unpacked_p2_2 = Eightpixels_to_floats((const pixel_t*)(p2 + x + bytes_per_cycle / 2)); // 8x32 + + __m256 result, result_2; + if constexpr (has_mask) { + auto unpacked_mask = Eightpixels_to_floats((const pixel_t*)(mask + x)); // 8x32 + unpacked_mask = _mm256_mul_ps(unpacked_mask, factor_v); + result = overlay_blend_avx2_core_new(unpacked_p1, unpacked_p2, unpacked_mask); + + auto unpacked_mask_2 = Eightpixels_to_floats((const pixel_t*)(mask + x + bytes_per_cycle / 2)); // 8x32 + unpacked_mask_2 = _mm256_mul_ps(unpacked_mask_2, factor_v); + result_2 = overlay_blend_avx2_core_new(unpacked_p1_2, unpacked_p2_2, unpacked_mask_2); + } + else { + result = overlay_blend_avx2_core_new(unpacked_p1, unpacked_p2, factor_v); + result_2 = overlay_blend_avx2_core_new(unpacked_p1_2, unpacked_p2_2, factor_v); + } + + Store_Eightpixels((pixel_t*)(p1 + x), result, rounder); + Store_Eightpixels((pixel_t*)(p1 + x + bytes_per_cycle / 2), result_2, rounder); + } + + // Leftover value + + for (int x = wMod16 / sizeof(pixel_t); x < width; x++) { + const float new_factor = has_mask ? static_cast(reinterpret_cast(mask)[x]) * factor : factor; + auto result = overlay_blend_c_core_simple(reinterpret_cast(p1)[x], reinterpret_cast(p2)[x], new_factor); + reinterpret_cast(p1)[x] = (pixel_t)(result + 0.5f); + } + + p1 += p1_pitch; + p2 += p2_pitch; + if (has_mask) + mask += mask_pitch; + } +} + +// instantiate +// mask yes/no +template void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +//-- +template void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); + +template +void overlay_blend_avx2_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f) +{ + + const int realwidth = width * sizeof(float); + + int wMod32 = (realwidth / 32) * 32; + auto opacity_v = _mm256_set1_ps(opacity_f); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < wMod32; x += 32) { + auto p1_f = _mm256_loadu_ps(reinterpret_cast(p1 + x)); + auto p2_f = _mm256_loadu_ps(reinterpret_cast(p2 + x)); + __m256 new_mask; + if constexpr (has_mask) { + new_mask = _mm256_loadu_ps(reinterpret_cast(mask + x)); + new_mask = _mm256_mul_ps(new_mask, opacity_v); + } + else { + new_mask = opacity_v; + } + auto result = _mm256_add_ps(p1_f, _mm256_mul_ps(_mm256_sub_ps(p2_f, p1_f), new_mask)); // p1*(1-mask) + p2*mask = p1+(p2-p1)*mask + + _mm256_storeu_ps(reinterpret_cast(p1 + x), result); + } + + // Leftover value + + for (int x = wMod32 / sizeof(float); x < width; x++) { + auto new_mask = has_mask ? reinterpret_cast(mask)[x] * opacity_f : opacity_f; + auto p1x = reinterpret_cast(p1)[x]; + auto p2x = reinterpret_cast(p2)[x]; + auto result = p1x + (p2x - p1x) * new_mask; // p1x*(1-new_mask) + p2x*mask + reinterpret_cast(p1)[x] = result; + } + + + p1 += p1_pitch; + p2 += p2_pitch; + if constexpr (has_mask) + mask += mask_pitch; + } +} + +template void overlay_blend_avx2_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_avx2_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); + diff --git a/avs_core/filters/overlay/intel/blend_common_avx2.h b/avs_core/filters/overlay/intel/blend_common_avx2.h new file mode 100644 index 0000000000..e3948c19f2 --- /dev/null +++ b/avs_core/filters/overlay/intel/blend_common_avx2.h @@ -0,0 +1,40 @@ +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit +// http://www.gnu.org/copyleft/gpl.html . +// +// Linking Avisynth statically or dynamically with other modules is making a +// combined work based on Avisynth. Thus, the terms and conditions of the GNU +// General Public License cover the whole combination. +// +// As a special exception, the copyright holders of Avisynth give you +// permission to link Avisynth with independent modules that communicate with +// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license +// terms of these independent modules, and to copy and distribute the +// resulting combined work under terms of your choice, provided that +// every copy of the combined work is accompanied by a complete copy of +// the source code of Avisynth (the version of Avisynth used to produce the +// combined work), being distributed under the terms of the GNU General +// Public License plus this exception. An independent module is a module +// which is not derived from or based on Avisynth, such as 3rd-party filters, +// import and export plugins, or graphical user interfaces. + +#include "avisynth.h" +#include + +#ifndef __blend_common_avx2_h +#define __blend_common_avx2_h + +template +void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); + +template +void overlay_blend_avx2_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); + +#endif // __blend_common_avx2_h diff --git a/avs_core/filters/overlay/intel/blend_common_sse.cpp b/avs_core/filters/overlay/intel/blend_common_sse.cpp index 8db35b5ff4..731765f475 100644 --- a/avs_core/filters/overlay/intel/blend_common_sse.cpp +++ b/avs_core/filters/overlay/intel/blend_common_sse.cpp @@ -38,6 +38,7 @@ #include "blend_common_sse.h" #include "../blend_common.h" +#include "../../../core/internal.h" // Intrinsics for SSE4.1, SSSE3, SSE3, SSE2, ISSE and MMX #include @@ -126,11 +127,6 @@ AVS_FORCEINLINE static __m128i overlay_merge_mask_sse41_uint16(const __m128i& p1 return t2; } -AVS_FORCEINLINE static __m128i overlay_merge_mask_sse2_float(const __m128i& p1, const __m128i& p2) { - __m128 mulres = _mm_mul_ps(_mm_castsi128_ps(p1), _mm_castsi128_ps(p2)); - return _mm_castps_si128(mulres); -} - /******************************** ********* Blend Opaque ********* @@ -159,7 +155,7 @@ AVS_FORCEINLINE __m128i overlay_blend_opaque_sse2_core(const __m128i& p1, const #ifdef X86_32 void overlay_blend_mmx_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height) { + const int width, const int height, const int opacity, const float opacity_f) { BYTE* original_p1 = p1; const BYTE* original_p2 = p2; const BYTE* original_mask = mask; @@ -211,17 +207,10 @@ static AVS_FORCEINLINE __m128i _MM_BLENDV_EPI8(__m128i const &a, __m128i const & return _mm_or_si128(_mm_and_si128(selector, b), _mm_andnot_si128(selector, a)); } -// non-existant in simd -static AVS_FORCEINLINE __m128i _MM_CMPLE_EPU16(__m128i x, __m128i y) -{ - // Returns 0xFFFF where x <= y: - return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128()); -} - // uint8_t only void overlay_blend_sse2_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height) + const int width, const int height, const int opacity, const float opacity_f) { __m128i v128; v128 = _mm_set1_epi16(0x0080); // rounder @@ -288,7 +277,7 @@ __attribute__((__target__("sse4.1"))) #endif void overlay_blend_sse41_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height) + const int width, const int height, const int opacity, const float opacity_f) { __m128i v128; if constexpr (sizeof(pixel_t) == 1) @@ -397,45 +386,344 @@ void overlay_blend_sse41_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask } } -void overlay_blend_sse2_plane_masked_float(BYTE *p1, const BYTE *p2, const BYTE *mask, +// instantiate +template void overlay_blend_sse41_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +// 16 bit: SSE4 only +template void overlay_blend_sse41_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); + + +template +#if defined(GCC) || defined(CLANG) +__attribute__((__target__("sse4.1"))) +#endif +static AVS_FORCEINLINE void Eightpixels_to_Eightfloats(const pixel_t* src, __m128& src_lo, __m128& src_hi, __m128i& zero) { + __m128i srci; + if constexpr (sizeof(pixel_t) == 1) { + srci = _mm_loadl_epi64(reinterpret_cast(src)); + srci = _mm_unpacklo_epi8(srci, zero); + } + else { + srci = _mm_loadu_si128(reinterpret_cast(src)); + } + src_lo = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(srci)); + src_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(srci, zero)); +} + +template +#if defined(GCC) || defined(CLANG) +__attribute__((__target__("sse4.1"))) +#endif +static AVS_FORCEINLINE void Store_Eightpixels(pixel_t* dst, __m128 what_lo, __m128 what_hi, const __m128 rounder) { + what_lo = _mm_add_ps(what_lo, rounder); // round + what_hi = _mm_add_ps(what_hi, rounder); // round + auto si32_lo = _mm_cvttps_epi32(what_lo); // truncate + auto si32_hi = _mm_cvttps_epi32(what_hi); // truncate + auto result = _mm_packus_epi32(si32_lo, si32_hi); // 2x4x32bit -> 8x16 + if constexpr (sizeof(pixel_t) == 1) { + __m128i result64 = _mm_packus_epi16(result, result); // 8x16bit -> 8x8 + _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), result64); + } + else { + /* when mask is 0..1 checked then this is not possible + if constexpr (bits_per_pixel < 16) { // otherwise no clamp needed + constexpr int max_pixel_value = (1 << bits_per_pixel) - 1; + auto max_pixel_value_v = _mm_set1_epi16(static_cast(max_pixel_value)); + result128 = _mm_min_epu16(result128, max_pixel_value_v); + } + */ + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), result); + } +} + +template +#if defined(GCC) || defined(CLANG) +__attribute__((__target__("sse2"))) +#endif +static AVS_FORCEINLINE void Eightpixels_to_Eightfloats_sse2(const pixel_t* src, __m128& src_lo, __m128& src_hi, __m128i& zero) { + __m128i srci; + if constexpr (sizeof(pixel_t) == 1) { + srci = _mm_loadl_epi64(reinterpret_cast(src)); + srci = _mm_unpacklo_epi8(srci, zero); + } + else { + srci = _mm_loadu_si128(reinterpret_cast(src)); + } + src_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(srci, zero)); + src_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(srci, zero)); +} + +template +#if defined(GCC) || defined(CLANG) +__attribute__((__target__("sse2"))) +#endif +static AVS_FORCEINLINE void Store_Eightpixels_sse2(pixel_t* dst, __m128 what_lo, __m128 what_hi, const __m128 rounder) { + what_lo = _mm_add_ps(what_lo, rounder); // round + what_hi = _mm_add_ps(what_hi, rounder); // round + auto si32_lo = _mm_cvttps_epi32(what_lo); // truncate + auto si32_hi = _mm_cvttps_epi32(what_hi); // truncate + if constexpr (sizeof(pixel_t) == 1) { + auto result = _mm_packs_epi32(si32_lo, si32_hi); // 2x4x32bit -> 8x16 + __m128i result64 = _mm_packus_epi16(result, result); // 8x16bit -> 8x8 + _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), result64); + } + else { + auto result = _MM_PACKUS_EPI32(si32_lo, si32_hi); // 2x4x32bit -> 8x16 + /* when mask is 0..1 checked then this is not possible + if constexpr (bits_per_pixel < 16) { // otherwise no clamp needed + constexpr int max_pixel_value = (1 << bits_per_pixel) - 1; + auto max_pixel_value_v = _mm_set1_epi16(static_cast(max_pixel_value)); + result128 = _mm_min_epu16(result128, max_pixel_value_v); + } + */ + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), result); + } +} + + + +AVS_FORCEINLINE static __m128 overlay_blend_sse_core_new(const __m128& p1_f, const __m128& p2_f, const __m128& factor) { + /* + // p1*(1-mask_f) + p2*mask_f -> p1 + (p2-p1)*mask_f + constexpr int max_pixel_value = (1 << bits_per_pixel) - 1; + constexpr float factor = 1.0f / max_pixel_value; + constexpr float half_rounder = 0.5f; + const float mask_f = mask * factor; + const float res = p1 + (p2 - p1) * mask_f; + int result = (int)(res + 0.5f); + */ + // rounding not here, but before storage + auto res = _mm_add_ps(p1_f, _mm_mul_ps(_mm_sub_ps(p2_f, p1_f), factor)); + return res; +} + +template +#if defined(GCC) || defined(CLANG) +__attribute__((__target__("sse4.1"))) +#endif +void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f) +{ + + auto rounder = _mm_set1_ps(0.5f); + const int max_pixel_value = (1 << bits_per_pixel) - 1; + auto factor = 1.0f / max_pixel_value; + factor = factor * opacity_f; + auto factor_v = _mm_set1_ps(factor); + + const int realwidth = width * sizeof(pixel_t); + + // 8 pixels at a time + constexpr int bytes_per_cycle = 8 * sizeof(pixel_t); + int wMod8 = (realwidth / bytes_per_cycle) * bytes_per_cycle; + + auto zero = _mm_setzero_si128(); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < wMod8; x += bytes_per_cycle) { + __m128 unpacked_p1, unpacked_p1_2; + __m128 unpacked_p2, unpacked_p2_2; + Eightpixels_to_Eightfloats((const pixel_t*)(p1 + x), unpacked_p1, unpacked_p1_2, zero); // 8x32 + Eightpixels_to_Eightfloats((const pixel_t*)(p2 + x), unpacked_p2, unpacked_p2_2, zero); // 8x32 + + __m128 result, result_2; + if constexpr (has_mask) { + __m128 unpacked_mask, unpacked_mask_2; + Eightpixels_to_Eightfloats((const pixel_t*)(mask + x), unpacked_mask, unpacked_mask_2, zero); // 8x32 + unpacked_mask = _mm_mul_ps(unpacked_mask, factor_v); + unpacked_mask_2 = _mm_mul_ps(unpacked_mask_2, factor_v); + result = overlay_blend_sse_core_new(unpacked_p1, unpacked_p2, unpacked_mask); + result_2 = overlay_blend_sse_core_new(unpacked_p1_2, unpacked_p2_2, unpacked_mask_2); + } + else { + result = overlay_blend_sse_core_new(unpacked_p1, unpacked_p2, factor_v); + result_2 = overlay_blend_sse_core_new(unpacked_p1_2, unpacked_p2_2, factor_v); + } + + Store_Eightpixels((pixel_t*)(p1 + x), result, result_2, rounder); + } + + // Leftover value + + for (int x = wMod8 / sizeof(pixel_t); x < width; x++) { + const float new_factor = has_mask ? static_cast(reinterpret_cast(mask)[x]) * factor : factor; + auto result = overlay_blend_c_core_simple(reinterpret_cast(p1)[x], reinterpret_cast(p2)[x], new_factor); + reinterpret_cast(p1)[x] = (pixel_t)(result + 0.5f); + } + + p1 += p1_pitch; + p2 += p2_pitch; + if (has_mask) + mask += mask_pitch; + } +} + +// instantiate +// mask yes/no +template void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +//-- +template void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); + +template +#if defined(GCC) || defined(CLANG) +__attribute__((__target__("sse2"))) +#endif +void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f) +{ + + auto rounder = _mm_set1_ps(0.5f); + const int max_pixel_value = (1 << bits_per_pixel) - 1; + auto factor = 1.0f / max_pixel_value; + factor = factor * opacity_f; + auto factor_v = _mm_set1_ps(factor); + + const int realwidth = width * sizeof(pixel_t); + + // 8 pixels at a time + constexpr int bytes_per_cycle = 8 * sizeof(pixel_t); + int wMod8 = (realwidth / bytes_per_cycle) * bytes_per_cycle; + + auto zero = _mm_setzero_si128(); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < wMod8; x += bytes_per_cycle) { + __m128 unpacked_p1, unpacked_p1_2; + __m128 unpacked_p2, unpacked_p2_2; + Eightpixels_to_Eightfloats_sse2((const pixel_t*)(p1 + x), unpacked_p1, unpacked_p1_2, zero); // 8x32 + Eightpixels_to_Eightfloats_sse2((const pixel_t*)(p2 + x), unpacked_p2, unpacked_p2_2, zero); // 8x32 + + __m128 result, result_2; + if constexpr (has_mask) { + __m128 unpacked_mask, unpacked_mask_2; + Eightpixels_to_Eightfloats_sse2((const pixel_t*)(mask + x), unpacked_mask, unpacked_mask_2, zero); // 8x32 + unpacked_mask = _mm_mul_ps(unpacked_mask, factor_v); + unpacked_mask_2 = _mm_mul_ps(unpacked_mask_2, factor_v); + result = overlay_blend_sse_core_new(unpacked_p1, unpacked_p2, unpacked_mask); + result_2 = overlay_blend_sse_core_new(unpacked_p1_2, unpacked_p2_2, unpacked_mask_2); + } + else { + result = overlay_blend_sse_core_new(unpacked_p1, unpacked_p2, factor_v); + result_2 = overlay_blend_sse_core_new(unpacked_p1_2, unpacked_p2_2, factor_v); + } + + Store_Eightpixels_sse2((pixel_t*)(p1 + x), result, result_2, rounder); + } + + // Leftover value + + for (int x = wMod8 / sizeof(pixel_t); x < width; x++) { + const float new_factor = has_mask ? static_cast(reinterpret_cast(mask)[x]) * factor : factor; + auto result = overlay_blend_c_core_simple(reinterpret_cast(p1)[x], reinterpret_cast(p2)[x], new_factor); + reinterpret_cast(p1)[x] = (pixel_t)(result + 0.5f); + } + + p1 += p1_pitch; + p2 += p2_pitch; + if (has_mask) + mask += mask_pitch; + } +} + +// instantiate +// mask yes/no +template void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +//-- +template void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +template void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); + + + +template +void overlay_blend_sse2_float(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height) { + const int width, const int height, const int /*opacity*/, const float opacity_f) +{ + const int realwidth = width * sizeof(float); int wMod16 = (realwidth / 16) * 16; + auto opacity_v = _mm_set1_ps(opacity_f); for (int y = 0; y < height; y++) { for (int x = 0; x < wMod16; x += 16) { - __m128i p1_f = _mm_loadu_si128(reinterpret_cast(p1 + x)); - __m128i p2_f = _mm_loadu_si128(reinterpret_cast(p2 + x)); - __m128i mask_f = _mm_loadu_si128(reinterpret_cast(mask + x)); - - __m128i result = overlay_blend_sse2_float_core(p1_f, p2_f, mask_f); + auto p1_f = _mm_loadu_ps(reinterpret_cast(p1 + x)); + auto p2_f = _mm_loadu_ps(reinterpret_cast(p2 + x)); + __m128 new_mask; + if constexpr (has_mask) { + new_mask = _mm_loadu_ps(reinterpret_cast(mask + x)); + new_mask = _mm_mul_ps(new_mask, opacity_v); + } + else { + new_mask = opacity_v; + } + auto result = _mm_add_ps(p1_f, _mm_mul_ps(_mm_sub_ps(p2_f, p1_f), new_mask)); // p1*(1-mask) + p2*mask = p1+(p2-p1)*mask - _mm_storeu_si128(reinterpret_cast<__m128i*>(p1 + x), result); + _mm_storeu_ps(reinterpret_cast(p1 + x), result); } // Leftover value for (int x = wMod16 / sizeof(float); x < width; x++) { - float result = overlay_blend_c_core_f(reinterpret_cast(p1)[x], reinterpret_cast(p2)[x], reinterpret_cast(mask)[x]); - reinterpret_cast(p1)[x] = result; + auto new_mask = has_mask ? reinterpret_cast(mask)[x] * opacity_f : opacity_f; + auto p1x = reinterpret_cast(p1)[x]; + auto p2x = reinterpret_cast(p2)[x]; + auto result = p1x + (p2x - p1x) * new_mask; // p1x*(1-new_mask) + p2x*mask + reinterpret_cast(p1)[x] = result; } p1 += p1_pitch; p2 += p2_pitch; - mask += mask_pitch; + if constexpr (has_mask) + mask += mask_pitch; } } // instantiate -template void overlay_blend_sse41_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height); -// 16 bit: SSE4 only -template void overlay_blend_sse41_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height); -template void overlay_blend_sse41_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height); -template void overlay_blend_sse41_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height); -template void overlay_blend_sse41_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height); +template void overlay_blend_sse2_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int /*opacity*/, const float opacity_f); +template void overlay_blend_sse2_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int /*opacity*/, const float opacity_f); @@ -487,8 +775,8 @@ void overlay_blend_mmx_plane_opacity(BYTE *p1, const BYTE *p2, } #endif -void overlay_blend_sse2_plane_opacity(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +void overlay_blend_sse2_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* /*mask*/ , + const int p1_pitch, const int p2_pitch, const int /*mask_pitch*/, const int width, const int height, const int opacity, const float opacity_f) { /* const int OPACITY_SHIFT = 8; // opacity always max 0..256 @@ -559,8 +847,8 @@ template #if defined(GCC) || defined(CLANG) __attribute__((__target__("sse4.1"))) #endif -void overlay_blend_sse41_plane_opacity_uint16(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +void overlay_blend_sse41_plane_opacity_uint16(BYTE *p1, const BYTE *p2, const BYTE* /*mask*/, + const int p1_pitch, const int p2_pitch, const int /*mask_pitch*/, const int width, const int height, const int opacity, const float opacity_f) { /* @@ -644,72 +932,19 @@ void overlay_blend_sse41_plane_opacity_uint16(BYTE *p1, const BYTE *p2, } } -void overlay_blend_sse2_plane_opacity_float(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, - const int width, const int height, const int opacity, const float opacity_f) { - /* - const int OPACITY_SHIFT = 8; // opacity always max 0..256 - const int MASK_CORR_SHIFT = OPACITY_SHIFT; // no mask, mask = opacity, 8 bits always - const int half_pixel_value_rounding = (1 << (MASK_CORR_SHIFT - 1)); - - // avoid "uint16*uint16 can't get into int32" overflows - // no need here, opacity as mask is always 8 bit - // typedef std::conditional < sizeof(pixel_t) == 1, int, typename std::conditional < sizeof(pixel_t) == 2, int64_t, float>::type >::type result_t; - - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - pixel_t p1x = reinterpret_cast(p1)[x]; - pixel_t p2x = reinterpret_cast(p2)[x]; - pixel_t result = (pixel_t)((((p1x << MASK_CORR_SHIFT) | half_pixel_value_rounding) + (p2x-p1x)*opacity) >> MASK_CORR_SHIFT); - //BYTE result = overlay_blend_c_core_8(p1[x], p2[x], opacity); - reinterpret_cast(p1)[x] = result; - } - */ - AVS_UNUSED(opacity); - - __m128i mask; - mask = _mm_castps_si128(_mm_set1_ps(opacity_f)); - const int realwidth = width * sizeof(float); - - int wMod16 = (realwidth / 16) * 16; - - for (int y = 0; y < height; y++) { - for (int x = 0; x < wMod16; x += 16) { - __m128i p1_f, p2_f; - - p1_f = _mm_loadu_si128(reinterpret_cast(p1 + x)); - p2_f = _mm_loadu_si128(reinterpret_cast(p2 + x)); - - __m128i result; - // sizeof(pixel_t) == 4, float - result = overlay_blend_sse2_float_core(p1_f, p2_f, mask); - - _mm_storeu_si128(reinterpret_cast<__m128i*>(p1 + x), result); - } - - // Leftover value - for (int x = wMod16 / sizeof(float); x < width; x++) { - float result = overlay_blend_c_core_f(reinterpret_cast(p1)[x], reinterpret_cast(p2)[x], opacity_f); - reinterpret_cast(p1)[x] = result; - } - - p1 += p1_pitch; - p2 += p2_pitch; - } -} // instantiate -template void overlay_blend_sse41_plane_opacity_uint16<10>(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +template void overlay_blend_sse41_plane_opacity_uint16<10>(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); -template void overlay_blend_sse41_plane_opacity_uint16<12>(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +template void overlay_blend_sse41_plane_opacity_uint16<12>(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); -template void overlay_blend_sse41_plane_opacity_uint16<14>(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +template void overlay_blend_sse41_plane_opacity_uint16<14>(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); -template void overlay_blend_sse41_plane_opacity_uint16<16>(BYTE *p1, const BYTE *p2, - const int p1_pitch, const int p2_pitch, +template void overlay_blend_sse41_plane_opacity_uint16<16>(BYTE *p1, const BYTE *p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); @@ -954,47 +1189,6 @@ void overlay_blend_sse41_plane_masked_opacity(BYTE *p1, const BYTE *p2, const BY } } -void overlay_blend_sse2_plane_masked_opacity_float(BYTE *p1, const BYTE *p2, const BYTE *mask, - const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height, const int opacity, const float opacity_f) { - - AVS_UNUSED(opacity_f); - - __m128i opacity_mask = _mm_castps_si128(_mm_set1_ps(opacity_f)); - const int realwidth = width * sizeof(float); - - int wMod16 = (realwidth / 16) * 16; - - for (int y = 0; y < height; y++) { - for (int x = 0; x < wMod16; x += 16) { - __m128i p1_f, p2_f; - __m128i mask_f; - - p1_f = _mm_loadu_si128(reinterpret_cast(p1 + x)); - p2_f = _mm_loadu_si128(reinterpret_cast(p2 + x)); - mask_f = _mm_loadu_si128(reinterpret_cast(mask + x)); - - mask_f = overlay_merge_mask_sse2_float(mask_f, opacity_mask); - __m128i result = overlay_blend_sse2_float_core(p1_f, p2_f, mask_f); - - _mm_storeu_si128(reinterpret_cast<__m128i*>(p1 + x), result); - } - - // Leftover value - for (int x = wMod16 / sizeof(float); x < width; x++) { - float new_mask = (reinterpret_cast(mask)[x] * opacity_f); - float p1x = reinterpret_cast(p1)[x]; - float p2x = reinterpret_cast(p2)[x]; - - float result = p1x + (p2x - p1x)*new_mask; - reinterpret_cast(p1)[x] = result; - } - - p1 += p1_pitch; - p2 += p2_pitch; - mask += mask_pitch; - } -} // instantiate template void overlay_blend_sse41_plane_masked_opacity(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); diff --git a/avs_core/filters/overlay/intel/blend_common_sse.h b/avs_core/filters/overlay/intel/blend_common_sse.h index 2b181f53d4..9952f737f7 100644 --- a/avs_core/filters/overlay/intel/blend_common_sse.h +++ b/avs_core/filters/overlay/intel/blend_common_sse.h @@ -43,9 +43,11 @@ #ifdef X86_32 void overlay_blend_mmx_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height); + const int width, const int height, const int opacity, const float opacity_f); #endif -void overlay_blend_sse2_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height); +void overlay_blend_sse2_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f); template #if defined(GCC) || defined(CLANG) @@ -53,31 +55,46 @@ __attribute__((__target__("sse4.1"))) #endif void overlay_blend_sse41_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, - const int width, const int height); + const int width, const int height, const int opacity, const float opacity_f); + +template +#if defined(GCC) || defined(CLANG) +__attribute__((__target__("sse4.1"))) +#endif +void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f); + +template +#if defined(GCC) || defined(CLANG) +__attribute__((__target__("sse2"))) +#endif +void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f); -void overlay_blend_sse2_plane_masked_float(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height); +template +void overlay_blend_sse2_float(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f); #ifdef X86_32 -void overlay_blend_mmx_plane_opacity(BYTE* p1, const BYTE* p2, - const int p1_pitch, const int p2_pitch, +void overlay_blend_mmx_plane_opacity(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); #endif -void overlay_blend_sse2_plane_opacity(BYTE* p1, const BYTE* p2, - const int p1_pitch, const int p2_pitch, +void overlay_blend_sse2_plane_opacity(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); template #if defined(GCC) || defined(CLANG) __attribute__((__target__("sse4.1"))) #endif -void overlay_blend_sse41_plane_opacity_uint16(BYTE* p1, const BYTE* p2, - const int p1_pitch, const int p2_pitch, - const int width, const int height, const int opacity, const float opacity_f); - -void overlay_blend_sse2_plane_opacity_float(BYTE* p1, const BYTE* p2, - const int p1_pitch, const int p2_pitch, +void overlay_blend_sse41_plane_opacity_uint16(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); @@ -87,7 +104,9 @@ void overlay_blend_mmx_plane_masked_opacity(BYTE* p1, const BYTE* p2, const BYTE const int width, const int height, const int opacity, const float opacity_f); #endif -void overlay_blend_sse2_plane_masked_opacity(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); +void overlay_blend_sse2_plane_masked_opacity(BYTE* p1, const BYTE* p2, const BYTE* mask, + const int p1_pitch, const int p2_pitch, const int mask_pitch, + const int width, const int height, const int opacity, const float opacity_f); template #if defined(GCC) || defined(CLANG) @@ -97,8 +116,6 @@ void overlay_blend_sse41_plane_masked_opacity(BYTE* p1, const BYTE* p2, const BY const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); -void overlay_blend_sse2_plane_masked_opacity_float(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); - #ifdef X86_32 void overlay_darken_mmx(BYTE* p1Y, BYTE* p1U, BYTE* p1V, const BYTE* p2Y, const BYTE* p2U, const BYTE* p2V, int p1_pitch, int p2_pitch, int width, int height); void overlay_lighten_mmx(BYTE* p1Y, BYTE* p1U, BYTE* p1V, const BYTE* p2Y, const BYTE* p2U, const BYTE* p2V, int p1_pitch, int p2_pitch, int width, int height); diff --git a/avs_core/filters/overlay/overlay.cpp b/avs_core/filters/overlay/overlay.cpp index 451c25fd93..29623f84f3 100644 --- a/avs_core/filters/overlay/overlay.cpp +++ b/avs_core/filters/overlay/overlay.cpp @@ -719,6 +719,9 @@ void Overlay::SetOfModeByName(const char* name, IScriptEnvironment* env) { if (!lstrcmpi(name, "Blend")) { of_mode = OF_Blend; } + else if (!lstrcmpi(name, "Blend_Compat")) { + of_mode = OF_Blend_Compat; + } else if (!lstrcmpi(name, "Add")) { of_mode = OF_Add; } @@ -758,7 +761,9 @@ void Overlay::SetOfModeByName(const char* name, IScriptEnvironment* env) { OverlayFunction* Overlay::SelectFunction() { switch (of_mode) { - case OF_Blend: return new OL_BlendImage(); + case OF_Blend: + case OF_Blend_Compat: + return new OL_BlendImage(); case OF_Add: return new OL_AddImage(); case OF_Subtract: return new OL_AddImage(); // common with Add //return new OL_SubtractImage(); case OF_Multiply: return new OL_MultiplyImage(); diff --git a/avs_core/filters/overlay/overlayfunctions.h b/avs_core/filters/overlay/overlayfunctions.h index c461516d0e..2f258059c8 100644 --- a/avs_core/filters/overlay/overlayfunctions.h +++ b/avs_core/filters/overlay/overlayfunctions.h @@ -54,7 +54,8 @@ enum { OF_SoftLight, OF_HardLight, OF_Difference, - OF_Exclusion + OF_Exclusion, + OF_Blend_Compat }; class OverlayFunction {