Skip to content

Commit

Permalink
Overlay blend: rewrite. 32bit float calculation inside (#255)
Browse files Browse the repository at this point in the history
  • Loading branch information
pinterf committed Jan 29, 2022
1 parent 7b30966 commit ceae03c
Show file tree
Hide file tree
Showing 9 changed files with 935 additions and 275 deletions.
210 changes: 147 additions & 63 deletions avs_core/filters/overlay/OF_blend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "blend_common.h"
#ifdef INTEL_INTRINSICS
#include "intel/blend_common_sse.h"
#include "intel/blend_common_avx2.h"
#endif


Expand Down Expand Up @@ -72,7 +73,7 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
int planeindex_from = 0;
int planeindex_to = 0;

if (of_mode == OF_Blend) {
if (of_mode == OF_Blend || of_mode == OF_Blend_Compat) {
planeindex_from = 0;
planeindex_to = greyscale ? 0 : 2;
}
Expand All @@ -87,24 +88,70 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
planeindex_to = 2;
}

if ((opacity == 256 && pixelsize != 4) || (opacity_f == 1.0f && pixelsize == 4)) {
overlay_blend_plane_masked_t* blend_fn = nullptr;
overlay_blend_plane_masked_opacity_t* blend_fn = nullptr;

if (of_mode != OF_Blend_Compat || pixelsize == 4) {
// independent from full/not full opacity
#ifdef INTEL_INTRINSICS
if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) {
blend_fn = overlay_blend_sse2_plane_masked_float;
if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_AVX2)) {
blend_fn = overlay_blend_avx2_float<true>;
}
else if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
else if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) {
blend_fn = overlay_blend_sse2_float<true>;
}
else if (env->GetCPUFlags() & CPUF_AVX2) {
switch (bits_per_pixel) {
case 8: blend_fn = overlay_blend_avx2_uint<true, uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_avx2_uint<true, uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_avx2_uint<true, uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_avx2_uint<true, uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_avx2_uint<true, uint16_t, 16>; break;
}
}
else if (env->GetCPUFlags() & CPUF_SSE4_1) {
switch (bits_per_pixel) {
case 8: blend_fn = overlay_blend_sse41_uint<true, uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_sse41_uint<true, uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_sse41_uint<true, uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_sse41_uint<true, uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_sse41_uint<true, uint16_t, 16>; break;
}
}
else if (env->GetCPUFlags() & CPUF_SSE2) {
switch (bits_per_pixel) {
case 8: blend_fn = overlay_blend_sse2_uint<true, uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_sse2_uint<true, uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_sse2_uint<true, uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_sse2_uint<true, uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_sse2_uint<true, uint16_t, 16>; break;
}
}
else
#endif // INTEL_INTRINSICS
{
// pure C
switch (bits_per_pixel) {
case 8: blend_fn = overlay_blend_c_uint<true, uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_c_uint<true, uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_c_uint<true, uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_c_uint<true, uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_c_uint<true, uint16_t, 16>; break;
case 32: blend_fn = overlay_blend_c_float<true>; break;
}
}
// end of new, float precision inside masked overlays
}
else if (opacity == 256) {
// specialized functions for full opacity
#ifdef INTEL_INTRINSICS
if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
switch (bits_per_pixel) {
case 10: blend_fn = overlay_blend_sse41_plane_masked<uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_sse41_plane_masked<uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_sse41_plane_masked<uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_sse41_plane_masked<uint16_t, 16>; break;
}
}
else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
blend_fn = overlay_blend_sse41_plane_masked<uint8_t, 8>;
}
else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) {
blend_fn = overlay_blend_sse2_plane_masked;
}
Expand All @@ -117,34 +164,21 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
#endif
#endif // INTEL_INTRINSICS
{
// pure C
switch (bits_per_pixel) {
case 8: blend_fn = overlay_blend_c_plane_masked<uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_c_plane_masked<uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_c_plane_masked<uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_c_plane_masked<uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_c_plane_masked<uint16_t, 16>; break;
case 32: blend_fn = overlay_blend_c_plane_masked_f; break;
}

}

if (blend_fn == nullptr)
env->ThrowError("Blend: no valid internal function");

for (int p = planeindex_from; p <= planeindex_to; p++) {
blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), mask->GetPtrByIndex(p),
base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), mask->GetPitchByIndex(p),
(w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p]);
}
}
else {
overlay_blend_plane_masked_opacity_t* blend_fn = nullptr;

// specialized functions for non-full opacity
#ifdef INTEL_INTRINSICS
if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) {
blend_fn = overlay_blend_sse2_plane_masked_opacity_float;
}
else if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
switch (bits_per_pixel)
{
case 10: blend_fn = overlay_blend_sse41_plane_masked_opacity<uint16_t, 10>; break;
Expand All @@ -153,9 +187,6 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
case 16: blend_fn = overlay_blend_sse41_plane_masked_opacity<uint16_t, 16>; break;
}
}
else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
blend_fn = overlay_blend_sse41_plane_masked_opacity<uint8_t, 8>;
}
else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) {
blend_fn = overlay_blend_sse2_plane_masked_opacity;
}
Expand All @@ -174,18 +205,17 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
case 12:blend_fn = overlay_blend_c_plane_masked_opacity<uint16_t, 12>; break;
case 14:blend_fn = overlay_blend_c_plane_masked_opacity<uint16_t, 14>; break;
case 16:blend_fn = overlay_blend_c_plane_masked_opacity<uint16_t, 16>; break;
case 32: blend_fn = overlay_blend_c_plane_masked_opacity_f; break;
}
}
}

if (blend_fn == nullptr)
env->ThrowError("Blend: no valid internal function");
if (blend_fn == nullptr)
env->ThrowError("Blend: no valid internal function");

for (int p = planeindex_from; p <= planeindex_to; p++) {
blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), mask->GetPtrByIndex(p),
base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), mask->GetPitchByIndex(p),
(w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f);
}
for (int p = planeindex_from; p <= planeindex_to; p++) {
blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), mask->GetPtrByIndex(p),
base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), mask->GetPitchByIndex(p),
(w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f);
}
}

Expand All @@ -199,7 +229,7 @@ void OL_BlendImage::BlendImage(ImageOverlayInternal* base, ImageOverlayInternal*
int planeindex_from = 0;
int planeindex_to = 0;

if (of_mode == OF_Blend) {
if (of_mode == OF_Blend || of_mode == OF_Blend_Compat) {
planeindex_from = 0;
planeindex_to = greyscale ? 0 : 2;
}
Expand All @@ -220,46 +250,100 @@ void OL_BlendImage::BlendImage(ImageOverlayInternal* base, ImageOverlayInternal*
}
}
else {
overlay_blend_plane_opacity_t* blend_fn = nullptr;
overlay_blend_plane_masked_opacity_t* blend_fn = nullptr;

if (of_mode != OF_Blend_Compat || pixelsize == 4) {
#ifdef INTEL_INTRINSICS
if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) {
blend_fn = overlay_blend_sse2_plane_opacity_float;
}
else if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
switch (bits_per_pixel) {
case 10: blend_fn = overlay_blend_sse41_plane_opacity_uint16<10>; break;
case 12: blend_fn = overlay_blend_sse41_plane_opacity_uint16<12>; break;
case 14: blend_fn = overlay_blend_sse41_plane_opacity_uint16<14>; break;
case 16: blend_fn = overlay_blend_sse41_plane_opacity_uint16<16>; break;
if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_AVX2)) {
blend_fn = overlay_blend_avx2_float<false>;
}
}
else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) {
blend_fn = overlay_blend_sse2_plane_opacity;
}
else
#ifdef X86_32
if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_MMX)) {
blend_fn = overlay_blend_mmx_plane_opacity;
else if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) {
blend_fn = overlay_blend_sse2_float<false>;
}
else if (env->GetCPUFlags() & CPUF_AVX2) {
switch (bits_per_pixel) {
case 8: blend_fn = overlay_blend_avx2_uint<false, uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_avx2_uint<false, uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_avx2_uint<false, uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_avx2_uint<false, uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_avx2_uint<false, uint16_t, 16>; break;
}
}
else if (env->GetCPUFlags() & CPUF_SSE4_1) {
switch (bits_per_pixel) {
case 8: blend_fn = overlay_blend_sse41_uint<false, uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_sse41_uint<false, uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_sse41_uint<false, uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_sse41_uint<false, uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_sse41_uint<false, uint16_t, 16>; break;
}
}
else if (env->GetCPUFlags() & CPUF_SSE2) {
switch (bits_per_pixel) {
case 8: blend_fn = overlay_blend_sse2_uint<false, uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_sse2_uint<false, uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_sse2_uint<false, uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_sse2_uint<false, uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_sse2_uint<false, uint16_t, 16>; break;
}
}
else
#endif
#endif // INTEL_INTRINSICS
{
// pure C
switch (bits_per_pixel) {
case 8: blend_fn = overlay_blend_c_plane_opacity<uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 16>; break;
case 32: blend_fn = overlay_blend_c_plane_opacity_f; break;
case 8: blend_fn = overlay_blend_c_uint<false, uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_c_uint<false, uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_c_uint<false, uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_c_uint<false, uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_c_uint<false, uint16_t, 16>; break;
case 32: blend_fn = overlay_blend_c_float<false>; break;
}
}
// end of new, float precision inside non-masked overlays
}
else {
// old routines
#ifdef INTEL_INTRINSICS
if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
switch (bits_per_pixel) {
case 10: blend_fn = overlay_blend_sse41_plane_opacity_uint16<10>; break;
case 12: blend_fn = overlay_blend_sse41_plane_opacity_uint16<12>; break;
case 14: blend_fn = overlay_blend_sse41_plane_opacity_uint16<14>; break;
case 16: blend_fn = overlay_blend_sse41_plane_opacity_uint16<16>; break;
}
}
else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) {
blend_fn = overlay_blend_sse2_plane_opacity;
}
else
#ifdef X86_32
if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_MMX)) {
blend_fn = overlay_blend_mmx_plane_opacity;
}
else
#endif
#endif // INTEL_INTRINSICS
{
switch (bits_per_pixel) {
case 8: blend_fn = overlay_blend_c_plane_opacity<uint8_t, 8>; break;
case 10: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 10>; break;
case 12: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 12>; break;
case 14: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 14>; break;
case 16: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 16>; break;
}
}
}

if (blend_fn == nullptr)
env->ThrowError("Blend: no valid internal function");

for (int p = planeindex_from; p <= planeindex_to; p++) {
blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f);
// no mask ptr
blend_fn(
base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), nullptr,
base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), 0,
(w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f);
}

}
Expand Down
Loading

0 comments on commit ceae03c

Please sign in to comment.