From ceae03c17b4095a7612dbbc7be864ca9ff0f872f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ferenc=20Pint=C3=A9r?= <pinterfer@gmail.com>
Date: Sat, 29 Jan 2022 10:10:05 +0100
Subject: [PATCH] Overlay blend: rewrite. 32bit float calculation inside (#255)

---
 avs_core/filters/overlay/OF_blend.cpp         | 210 +++++---
 avs_core/filters/overlay/blend_common.cpp     | 163 ++++--
 avs_core/filters/overlay/blend_common.h       |  36 +-
 .../overlay/intel/blend_common_avx2.cpp       | 230 +++++++++
 .../filters/overlay/intel/blend_common_avx2.h |  40 ++
 .../overlay/intel/blend_common_sse.cpp        | 470 +++++++++++++-----
 .../filters/overlay/intel/blend_common_sse.h  |  51 +-
 avs_core/filters/overlay/overlay.cpp          |   7 +-
 avs_core/filters/overlay/overlayfunctions.h   |   3 +-
 9 files changed, 935 insertions(+), 275 deletions(-)
 create mode 100644 avs_core/filters/overlay/intel/blend_common_avx2.cpp
 create mode 100644 avs_core/filters/overlay/intel/blend_common_avx2.h
diff --git a/avs_core/filters/overlay/OF_blend.cpp b/avs_core/filters/overlay/OF_blend.cpp
index db157c7fe5..e469168394 100644
--- a/avs_core/filters/overlay/OF_blend.cpp
+++ b/avs_core/filters/overlay/OF_blend.cpp
@@ -38,6 +38,7 @@
 #include "blend_common.h"
 #ifdef INTEL_INTRINSICS
 #include "intel/blend_common_sse.h"
+#include "intel/blend_common_avx2.h"
 #endif
 
 
@@ -72,7 +73,7 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
   int planeindex_from = 0;
   int planeindex_to = 0;
 
-  if (of_mode == OF_Blend) {
+  if (of_mode == OF_Blend || of_mode == OF_Blend_Compat) {
     planeindex_from = 0;
     planeindex_to = greyscale ? 0 : 2;
   }
@@ -87,14 +88,63 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
     planeindex_to = 2;
   }
 
-  if ((opacity == 256 && pixelsize != 4) || (opacity_f == 1.0f && pixelsize == 4)) {
-    overlay_blend_plane_masked_t* blend_fn = nullptr;
+  overlay_blend_plane_masked_opacity_t* blend_fn = nullptr;
 
+  if (of_mode != OF_Blend_Compat || pixelsize == 4) {
+    // independent from full/not full opacity
 #ifdef INTEL_INTRINSICS
-    if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) {
-      blend_fn = overlay_blend_sse2_plane_masked_float;
+    if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_AVX2)) {
+      blend_fn = overlay_blend_avx2_float<true>;
     }
-    else if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
+    else if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) {
+      blend_fn = overlay_blend_sse2_float<true>;
+    }
+    else if (env->GetCPUFlags() & CPUF_AVX2) {
+      switch (bits_per_pixel) {
+      case 8: blend_fn = overlay_blend_avx2_uint<true, uint8_t, 8>; break;
+      case 10: blend_fn = overlay_blend_avx2_uint<true, uint16_t, 10>; break;
+      case 12: blend_fn = overlay_blend_avx2_uint<true, uint16_t, 12>; break;
+      case 14: blend_fn = overlay_blend_avx2_uint<true, uint16_t, 14>; break;
+      case 16: blend_fn = overlay_blend_avx2_uint<true, uint16_t, 16>; break;
+      }
+    }
+    else if (env->GetCPUFlags() & CPUF_SSE4_1) {
+      switch (bits_per_pixel) {
+      case 8: blend_fn = overlay_blend_sse41_uint<true, uint8_t, 8>; break;
+      case 10: blend_fn = overlay_blend_sse41_uint<true, uint16_t, 10>; break;
+      case 12: blend_fn = overlay_blend_sse41_uint<true, uint16_t, 12>; break;
+      case 14: blend_fn = overlay_blend_sse41_uint<true, uint16_t, 14>; break;
+      case 16: blend_fn = overlay_blend_sse41_uint<true, uint16_t, 16>; break;
+      }
+    }
+    else if (env->GetCPUFlags() & CPUF_SSE2) {
+      switch (bits_per_pixel) {
+      case 8: blend_fn = overlay_blend_sse2_uint<true, uint8_t, 8>; break;
+      case 10: blend_fn = overlay_blend_sse2_uint<true, uint16_t, 10>; break;
+      case 12: blend_fn = overlay_blend_sse2_uint<true, uint16_t, 12>; break;
+      case 14: blend_fn = overlay_blend_sse2_uint<true, uint16_t, 14>; break;
+      case 16: blend_fn = overlay_blend_sse2_uint<true, uint16_t, 16>; break;
+      }
+    }
+    else
+#endif // INTEL_INTRINSICS
+    {
+      // pure C
+      switch (bits_per_pixel) {
+      case 8: blend_fn = overlay_blend_c_uint<true, uint8_t, 8>; break;
+      case 10: blend_fn = overlay_blend_c_uint<true, uint16_t, 10>; break;
+      case 12: blend_fn = overlay_blend_c_uint<true, uint16_t, 12>; break;
+      case 14: blend_fn = overlay_blend_c_uint<true, uint16_t, 14>; break;
+      case 16: blend_fn = overlay_blend_c_uint<true, uint16_t, 16>; break;
+      case 32: blend_fn = overlay_blend_c_float<true>; break;
+      }
+    }
+    // end of new, float precision inside masked overlays
+  }
+  else if (opacity == 256) {
+    // specialized functions for full opacity
+#ifdef INTEL_INTRINSICS
+    if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
       switch (bits_per_pixel) {
       case 10: blend_fn = overlay_blend_sse41_plane_masked<uint16_t, 10>; break;
       case 12: blend_fn = overlay_blend_sse41_plane_masked<uint16_t, 12>; break;
@@ -102,9 +152,6 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
       case 16: blend_fn = overlay_blend_sse41_plane_masked<uint16_t, 16>; break;
       }
     }
-    else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
-      blend_fn = overlay_blend_sse41_plane_masked<uint8_t, 8>;
-    }
     else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) {
       blend_fn = overlay_blend_sse2_plane_masked;
     }
@@ -117,34 +164,21 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
 #endif
 #endif // INTEL_INTRINSICS
       {
+        // pure C
         switch (bits_per_pixel) {
         case 8: blend_fn = overlay_blend_c_plane_masked<uint8_t, 8>; break;
         case 10: blend_fn = overlay_blend_c_plane_masked<uint16_t, 10>; break;
         case 12: blend_fn = overlay_blend_c_plane_masked<uint16_t, 12>; break;
         case 14: blend_fn = overlay_blend_c_plane_masked<uint16_t, 14>; break;
         case 16: blend_fn = overlay_blend_c_plane_masked<uint16_t, 16>; break;
-        case 32: blend_fn = overlay_blend_c_plane_masked_f; break;
         }
-
       }
 
-    if (blend_fn == nullptr)
-      env->ThrowError("Blend: no valid internal function");
-
-    for (int p = planeindex_from; p <= planeindex_to; p++) {
-      blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), mask->GetPtrByIndex(p),
-        base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), mask->GetPitchByIndex(p),
-        (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p]);
-    }
   }
   else {
-    overlay_blend_plane_masked_opacity_t* blend_fn = nullptr;
-
+    // specialized functions for non-full opacity
 #ifdef INTEL_INTRINSICS
-    if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) {
-      blend_fn = overlay_blend_sse2_plane_masked_opacity_float;
-    }
-    else if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
+    if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
       switch (bits_per_pixel)
       {
       case 10: blend_fn = overlay_blend_sse41_plane_masked_opacity<uint16_t, 10>; break;
@@ -153,9 +187,6 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
       case 16: blend_fn = overlay_blend_sse41_plane_masked_opacity<uint16_t, 16>; break;
       }
     }
-    else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
-      blend_fn = overlay_blend_sse41_plane_masked_opacity<uint8_t, 8>;
-    }
     else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) {
       blend_fn = overlay_blend_sse2_plane_masked_opacity;
     }
@@ -174,18 +205,17 @@ void OL_BlendImage::BlendImageMask(ImageOverlayInternal* base, ImageOverlayInter
         case 12:blend_fn = overlay_blend_c_plane_masked_opacity<uint16_t, 12>; break;
         case 14:blend_fn = overlay_blend_c_plane_masked_opacity<uint16_t, 14>; break;
         case 16:blend_fn = overlay_blend_c_plane_masked_opacity<uint16_t, 16>; break;
-        case 32: blend_fn = overlay_blend_c_plane_masked_opacity_f; break;
         }
       }
+  }
 
-    if (blend_fn == nullptr)
-      env->ThrowError("Blend: no valid internal function");
+  if (blend_fn == nullptr)
+    env->ThrowError("Blend: no valid internal function");
 
-    for (int p = planeindex_from; p <= planeindex_to; p++) {
-      blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), mask->GetPtrByIndex(p),
-        base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), mask->GetPitchByIndex(p),
-        (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f);
-    }
+  for (int p = planeindex_from; p <= planeindex_to; p++) {
+    blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), mask->GetPtrByIndex(p),
+      base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), mask->GetPitchByIndex(p),
+      (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f);
   }
 }
 
@@ -199,7 +229,7 @@ void OL_BlendImage::BlendImage(ImageOverlayInternal* base, ImageOverlayInternal*
   int planeindex_from = 0;
   int planeindex_to = 0;
 
-  if (of_mode == OF_Blend) {
+  if (of_mode == OF_Blend || of_mode == OF_Blend_Compat) {
     planeindex_from = 0;
     planeindex_to = greyscale ? 0 : 2;
   }
@@ -220,46 +250,100 @@ void OL_BlendImage::BlendImage(ImageOverlayInternal* base, ImageOverlayInternal*
     }
   }
   else {
-    overlay_blend_plane_opacity_t* blend_fn = nullptr;
+    overlay_blend_plane_masked_opacity_t* blend_fn = nullptr;
+
+    if (of_mode != OF_Blend_Compat || pixelsize == 4) {
 #ifdef INTEL_INTRINSICS
-    if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) {
-      blend_fn = overlay_blend_sse2_plane_opacity_float;
-    }
-    else if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
-      switch (bits_per_pixel) {
-      case 10: blend_fn = overlay_blend_sse41_plane_opacity_uint16<10>; break;
-      case 12: blend_fn = overlay_blend_sse41_plane_opacity_uint16<12>; break;
-      case 14: blend_fn = overlay_blend_sse41_plane_opacity_uint16<14>; break;
-      case 16: blend_fn = overlay_blend_sse41_plane_opacity_uint16<16>; break;
+      if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_AVX2)) {
+        blend_fn = overlay_blend_avx2_float<false>;
       }
-    }
-    else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) {
-      blend_fn = overlay_blend_sse2_plane_opacity;
-    }
-    else
-#ifdef X86_32
-      if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_MMX)) {
-        blend_fn = overlay_blend_mmx_plane_opacity;
+      else if (pixelsize == 4 && (env->GetCPUFlags() & CPUF_SSE2)) {
+        blend_fn = overlay_blend_sse2_float<false>;
+      }
+      else if (env->GetCPUFlags() & CPUF_AVX2) {
+        switch (bits_per_pixel) {
+        case 8: blend_fn = overlay_blend_avx2_uint<false, uint8_t, 8>; break;
+        case 10: blend_fn = overlay_blend_avx2_uint<false, uint16_t, 10>; break;
+        case 12: blend_fn = overlay_blend_avx2_uint<false, uint16_t, 12>; break;
+        case 14: blend_fn = overlay_blend_avx2_uint<false, uint16_t, 14>; break;
+        case 16: blend_fn = overlay_blend_avx2_uint<false, uint16_t, 16>; break;
+        }
+      }
+      else if (env->GetCPUFlags() & CPUF_SSE4_1) {
+        switch (bits_per_pixel) {
+        case 8: blend_fn = overlay_blend_sse41_uint<false, uint8_t, 8>; break;
+        case 10: blend_fn = overlay_blend_sse41_uint<false, uint16_t, 10>; break;
+        case 12: blend_fn = overlay_blend_sse41_uint<false, uint16_t, 12>; break;
+        case 14: blend_fn = overlay_blend_sse41_uint<false, uint16_t, 14>; break;
+        case 16: blend_fn = overlay_blend_sse41_uint<false, uint16_t, 16>; break;
+        }
+      }
+      else if (env->GetCPUFlags() & CPUF_SSE2) {
+        switch (bits_per_pixel) {
+        case 8: blend_fn = overlay_blend_sse2_uint<false, uint8_t, 8>; break;
+        case 10: blend_fn = overlay_blend_sse2_uint<false, uint16_t, 10>; break;
+        case 12: blend_fn = overlay_blend_sse2_uint<false, uint16_t, 12>; break;
+        case 14: blend_fn = overlay_blend_sse2_uint<false, uint16_t, 14>; break;
+        case 16: blend_fn = overlay_blend_sse2_uint<false, uint16_t, 16>; break;
+        }
       }
       else
-#endif
 #endif // INTEL_INTRINSICS
       {
+        // pure C
         switch (bits_per_pixel) {
-        case 8: blend_fn = overlay_blend_c_plane_opacity<uint8_t, 8>; break;
-        case 10: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 10>; break;
-        case 12: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 12>; break;
-        case 14: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 14>; break;
-        case 16: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 16>; break;
-        case 32: blend_fn = overlay_blend_c_plane_opacity_f; break;
+        case 8: blend_fn = overlay_blend_c_uint<false, uint8_t, 8>; break;
+        case 10: blend_fn = overlay_blend_c_uint<false, uint16_t, 10>; break;
+        case 12: blend_fn = overlay_blend_c_uint<false, uint16_t, 12>; break;
+        case 14: blend_fn = overlay_blend_c_uint<false, uint16_t, 14>; break;
+        case 16: blend_fn = overlay_blend_c_uint<false, uint16_t, 16>; break;
+        case 32: blend_fn = overlay_blend_c_float<false>; break;
         }
       }
+      // end of new, float precision inside masked overlays
+    }
+    else {
+      // old routines
+#ifdef INTEL_INTRINSICS
+      if (pixelsize == 2 && (env->GetCPUFlags() & CPUF_SSE4_1)) {
+        switch (bits_per_pixel) {
+        case 10: blend_fn = overlay_blend_sse41_plane_opacity_uint16<10>; break;
+        case 12: blend_fn = overlay_blend_sse41_plane_opacity_uint16<12>; break;
+        case 14: blend_fn = overlay_blend_sse41_plane_opacity_uint16<14>; break;
+        case 16: blend_fn = overlay_blend_sse41_plane_opacity_uint16<16>; break;
+        }
+      }
+      else if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_SSE2)) {
+        blend_fn = overlay_blend_sse2_plane_opacity;
+      }
+      else
+#ifdef X86_32
+        if (pixelsize == 1 && (env->GetCPUFlags() & CPUF_MMX)) {
+          blend_fn = overlay_blend_mmx_plane_opacity;
+        }
+        else
+#endif
+#endif // INTEL_INTRINSICS
+        {
+          switch (bits_per_pixel) {
+          case 8: blend_fn = overlay_blend_c_plane_opacity<uint8_t, 8>; break;
+          case 10: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 10>; break;
+          case 12: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 12>; break;
+          case 14: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 14>; break;
+          case 16: blend_fn = overlay_blend_c_plane_opacity<uint16_t, 16>; break;
+          }
+        }
+    }
 
     if (blend_fn == nullptr)
       env->ThrowError("Blend: no valid internal function");
 
     for (int p = planeindex_from; p <= planeindex_to; p++) {
-      blend_fn(base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f);
+      // no mask ptr
+      blend_fn(
+        base->GetPtrByIndex(p), overlay->GetPtrByIndex(p), nullptr,
+        base->GetPitchByIndex(p), overlay->GetPitchByIndex(p), 0,
+        (w >> base->xSubSamplingShifts[p]), h >> base->ySubSamplingShifts[p], opacity, opacity_f);
     }
 
   }
diff --git a/avs_core/filters/overlay/blend_common.cpp b/avs_core/filters/overlay/blend_common.cpp
index ddb27fbb78..05baf6f6dd 100644
--- a/avs_core/filters/overlay/blend_common.cpp
+++ b/avs_core/filters/overlay/blend_common.cpp
@@ -47,43 +47,123 @@
  ********* Mode: Blend ********
  ******************************/
 
-template<typename pixel_t, int bits_per_pixel>
-void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask,
-                                  const int p1_pitch, const int p2_pitch, const int mask_pitch,
-                                  const int width, const int height)
+// Blend one integer plane in 32-bit float precision: p1 = p1 + (p2 - p1) * w, rounded to nearest and stored in p1.
+template<bool has_mask, typename pixel_t, int bits_per_pixel>
+void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
+  const int width, const int height, const int opacity, const float opacity_f)
 {
+  const int max_pixel_value = (1 << bits_per_pixel) - 1;
+  // masked: w = mask * opacity_f / max (mask scaled to 0..1); unmasked: w is opacity_f itself and must NOT be divided by max
+  const float factor = has_mask ? (1.0f / max_pixel_value) * opacity_f : opacity_f;
+
   for (int y = 0; y < height; y++) {
     for (int x = 0; x < width; x++) {
-      int new_mask = reinterpret_cast<const pixel_t *>(mask)[x];
-      pixel_t p1x = reinterpret_cast<pixel_t *>(p1)[x];
-      pixel_t p2x = reinterpret_cast<const pixel_t *>(p2)[x];
-      pixel_t result;
-      if constexpr(bits_per_pixel == 8)
-        result = (pixel_t)overlay_blend_c_core_8((BYTE)p1x, (BYTE)p2x, new_mask);
-      else
-        result = (pixel_t)overlay_blend_c_core_16<bits_per_pixel>((uint16_t)p1x, (uint16_t)p2x, new_mask);
-      reinterpret_cast<pixel_t *>(p1)[x] = result;
+      const float new_mask = has_mask ? (float)reinterpret_cast<const pixel_t*>(mask)[x] * factor : factor;
+      auto result = overlay_blend_c_core_simple(
+        reinterpret_cast<pixel_t*>(p1)[x],
+        reinterpret_cast<const pixel_t*>(p2)[x],
+        new_mask);
+      reinterpret_cast<pixel_t*>(p1)[x] = (pixel_t)(result + 0.5f); // +0.5f then truncate == round to nearest
     }
 
-    p1   += p1_pitch;
-    p2   += p2_pitch;
-    mask += mask_pitch;
+    p1 += p1_pitch;
+    p2 += p2_pitch;
+    if constexpr (has_mask)
+      mask += mask_pitch;
   }
 }
 
-void overlay_blend_c_plane_masked_f(BYTE *p1, const BYTE *p2, const BYTE *mask,
+// Explicit instantiations: the header only declares this template, so every variant needed elsewhere is emitted here.
+// has_mask == false: the mask pointer is never dereferenced or advanced
+template void overlay_blend_c_uint<false, uint8_t, 8>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_c_uint<false, uint16_t, 10>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_c_uint<false, uint16_t, 12>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_c_uint<false, uint16_t, 14>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_c_uint<false, uint16_t, 16>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+// has_mask == true: one mask sample is read per pixel
+template void overlay_blend_c_uint<true, uint8_t, 8>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_c_uint<true, uint16_t, 10>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_c_uint<true, uint16_t, 12>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_c_uint<true, uint16_t, 14>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_c_uint<true, uint16_t, 16>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+
+void overlay_blend_c_plane_masked_f(BYTE* p1, const BYTE* p2, const BYTE* mask, // masked blend of float samples; result is written back into p1
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height) {
+  const int width, const int height, const int /*opacity*/, const float /*opacity_f*/) { // opacity arguments are unnamed and never read
 
   typedef float pixel_t;
   for (int y = 0; y < height; y++) {
     for (int x = 0; x < width; x++) {
-      pixel_t new_mask = reinterpret_cast<const pixel_t *>(mask)[x];
-      pixel_t p1x = reinterpret_cast<pixel_t *>(p1)[x];
-      pixel_t p2x = reinterpret_cast<const pixel_t *>(p2)[x];
-      pixel_t result = p1x + (p2x-p1x)*new_mask; // p1x*(1-new_mask) + p2x*mask
+      pixel_t new_mask = reinterpret_cast<const pixel_t*>(mask)[x]; // mask sample is used directly as the blend weight
+      pixel_t p1x = reinterpret_cast<pixel_t*>(p1)[x];
+      pixel_t p2x = reinterpret_cast<const pixel_t*>(p2)[x];
+      pixel_t result = p1x + (p2x - p1x) * new_mask; // same as p1x*(1-new_mask) + p2x*new_mask
 
       //pixel_t result = overlay_blend_c_core(reinterpret_cast<pixel_t *>(p1)[x], reinterpret_cast<pixel_t *>(p2)[x], static_cast<int>(reinterpret_cast<pixel_t *>(mask)[x]));
+      reinterpret_cast<pixel_t*>(p1)[x] = result; // blended value overwrites the p1 sample in place
+    }
+
+    p1 += p1_pitch; // pitches are added to BYTE pointers, so they are byte offsets between rows
+    p2 += p2_pitch;
+    mask += mask_pitch;
+  }
+}
+
+template<bool has_mask>
+void overlay_blend_c_float(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
+  const int width, const int height, const int /*opacity*/, const float opacity_f) {
+  // Float samples: p1 = p1 + (p2 - p1) * w, where w is opacity_f, multiplied by the mask sample when has_mask.
+  for (int row = 0; row < height; ++row) {
+    float* const base_row = reinterpret_cast<float*>(p1);
+    const float* const overlay_row = reinterpret_cast<const float*>(p2);
+    for (int col = 0; col < width; ++col) {
+      float weight = opacity_f;
+      if constexpr (has_mask)
+        weight = reinterpret_cast<const float*>(mask)[col] * opacity_f;
+      base_row[col] = base_row[col] + (overlay_row[col] - base_row[col]) * weight; // == base*(1-w) + overlay*w
+    }
+    p1 += p1_pitch;
+    p2 += p2_pitch;
+    if constexpr (has_mask)
+      mask += mask_pitch; // mask rows are only walked when a mask exists
+  }
+}
+
+// explicit instantiations for both has_mask variants (the header only declares the template)
+template void overlay_blend_c_float<false>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int /*opacity*/, const float opacity_f);
+template void overlay_blend_c_float<true>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int /*opacity*/, const float opacity_f);
+
+
+
+template<typename pixel_t, int bits_per_pixel>
+void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask,
+                                  const int p1_pitch, const int p2_pitch, const int mask_pitch,
+                                  const int width, const int height, const int /*opacity*/, const float /*opacity_f*/)
+{
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      int new_mask = reinterpret_cast<const pixel_t *>(mask)[x];
+      pixel_t p1x = reinterpret_cast<pixel_t *>(p1)[x];
+      pixel_t p2x = reinterpret_cast<const pixel_t *>(p2)[x];
+      pixel_t result;
+      if constexpr(bits_per_pixel == 8)
+        result = (pixel_t)overlay_blend_c_core_8((BYTE)p1x, (BYTE)p2x, new_mask);
+      else
+        result = (pixel_t)overlay_blend_c_core_16<bits_per_pixel>((uint16_t)p1x, (uint16_t)p2x, new_mask);
       reinterpret_cast<pixel_t *>(p1)[x] = result;
     }
 
@@ -93,27 +173,28 @@ void overlay_blend_c_plane_masked_f(BYTE *p1, const BYTE *p2, const BYTE *mask,
   }
 }
 
+
 // instantiate
 template void overlay_blend_c_plane_masked<uint8_t, 8>(BYTE *p1, const BYTE *p2, const BYTE *mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height);
+  const int width, const int height, const int opacity, const float opacity_f);
 template void overlay_blend_c_plane_masked<uint16_t,10>(BYTE *p1, const BYTE *p2, const BYTE *mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height);
+  const int width, const int height, const int opacity, const float opacity_f);
 template void overlay_blend_c_plane_masked<uint16_t,12>(BYTE *p1, const BYTE *p2, const BYTE *mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height);
+  const int width, const int height, const int opacity, const float opacity_f);
 template void overlay_blend_c_plane_masked<uint16_t,14>(BYTE *p1, const BYTE *p2, const BYTE *mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height);
+  const int width, const int height, const int opacity, const float opacity_f);
 template void overlay_blend_c_plane_masked<uint16_t,16>(BYTE *p1, const BYTE *p2, const BYTE *mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height);
+  const int width, const int height, const int opacity, const float opacity_f);
 
 
 template<typename pixel_t, int bits_per_pixel>
-void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2,
-                                   const int p1_pitch, const int p2_pitch,
+void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* /*mask*/,
+                                   const int p1_pitch, const int p2_pitch, const int /*mask_pitch*/,
                                    const int width, const int height, const int opacity, const float opacity_f) {
 
   AVS_UNUSED(opacity_f);
@@ -140,8 +221,8 @@ void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2,
   }
 }
 
-void overlay_blend_c_plane_opacity_f(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+void overlay_blend_c_plane_opacity_f(BYTE *p1, const BYTE *p2, const BYTE* /*mask*/,
+  const int p1_pitch, const int p2_pitch, const int /*mask_pitch*/,
   const int width, const int height,const int opacity, const float opacity_f) {
 
   AVS_UNUSED(opacity);
@@ -160,20 +241,20 @@ void overlay_blend_c_plane_opacity_f(BYTE *p1, const BYTE *p2,
 }
 
 // instantiate
-template void overlay_blend_c_plane_opacity<uint8_t, 8>(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+template void overlay_blend_c_plane_opacity<uint8_t, 8>(BYTE *p1, const BYTE *p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
-template void overlay_blend_c_plane_opacity<uint16_t,10>(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+template void overlay_blend_c_plane_opacity<uint16_t,10>(BYTE *p1, const BYTE *p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
-template void overlay_blend_c_plane_opacity<uint16_t,12>(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+template void overlay_blend_c_plane_opacity<uint16_t,12>(BYTE *p1, const BYTE *p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
-template void overlay_blend_c_plane_opacity<uint16_t,14>(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+template void overlay_blend_c_plane_opacity<uint16_t,14>(BYTE *p1, const BYTE *p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
-template void overlay_blend_c_plane_opacity<uint16_t,16>(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+template void overlay_blend_c_plane_opacity<uint16_t,16>(BYTE *p1, const BYTE *p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
 
 
diff --git a/avs_core/filters/overlay/blend_common.h b/avs_core/filters/overlay/blend_common.h
index ed95a0131f..258276036f 100644
--- a/avs_core/filters/overlay/blend_common.h
+++ b/avs_core/filters/overlay/blend_common.h
@@ -40,14 +40,6 @@
 #include <avs/types.h>
 #include <avs/config.h>
 
-using overlay_blend_plane_masked_t = void(BYTE* p1, const BYTE* p2, const BYTE* mask,
-  const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height);
-
-using overlay_blend_plane_opacity_t = void(BYTE* p1, const BYTE* p2,
-  const int p1_pitch, const int p2_pitch,
-  const int width, const int height, const int opacity, const float opacity_f);
-
 using overlay_blend_plane_masked_opacity_t = void(BYTE* p1, const BYTE* p2, const BYTE* mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
@@ -79,6 +71,12 @@ AVS_FORCEINLINE static uint16_t overlay_blend_c_core_16(const uint16_t p1, const
     return (uint16_t)(((p1 << bits_per_pixel) + (p2 - p1)*mask + half_rounder) >> bits_per_pixel);
 }
 
+// Linear interpolation between two integer samples in float: factor 0 yields p1, factor 1 yields p2.
+AVS_FORCEINLINE static float overlay_blend_c_core_simple(const int p1, const int p2, const float factor) {
+  const int delta = p2 - p1;
+  return p1 + delta * factor; // p1*(1-factor) + p2*factor, rewritten as p1 + (p2-p1)*factor
+}
+
 AVS_FORCEINLINE static float overlay_blend_c_core_f(const float p1, const float p2, const float mask) {
   return p1 + (p2-p1)*mask; // p1*(1-mask) + p2*mask
 }
@@ -103,19 +101,29 @@ AVS_FORCEINLINE pixel_t overlay_blend_opaque_c_core(const pixel_t p1, const pixe
 // Mode: Overlay
 void overlay_blend_c_plane_masked_f(BYTE *p1, const BYTE *p2, const BYTE *mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height);
+  const int width, const int height, const int opacity, const float opacity_f);
 
 template<typename pixel_t, int bits_per_pixel>
 void overlay_blend_c_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask,
                                   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-                                  const int width, const int height);
+                                  const int width, const int height, const int opacity, const float opacity_f);
+
+template<bool has_mask, typename pixel_t, int bits_per_pixel>
+void overlay_blend_c_uint(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
+  const int width, const int height, const int opacity, const float opacity_f);
+
+template<bool has_mask>
+void overlay_blend_c_float(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
+  const int width, const int height, const int opacity, const float opacity_f);
 
 template<typename pixel_t, int bits_per_pixel>
-void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2,
-                                   const int p1_pitch, const int p2_pitch,
+void overlay_blend_c_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* mask,
+                                   const int p1_pitch, const int p2_pitch, const int mask_pitch,
                                    const int width, const int height, const int opacity, const float opacity_f);
-void overlay_blend_c_plane_opacity_f(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+void overlay_blend_c_plane_opacity_f(BYTE *p1, const BYTE *p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
 
 template<typename pixel_t, int bits_per_pixel>
diff --git a/avs_core/filters/overlay/intel/blend_common_avx2.cpp b/avs_core/filters/overlay/intel/blend_common_avx2.cpp
new file mode 100644
index 0000000000..1d88c735b9
--- /dev/null
+++ b/avs_core/filters/overlay/intel/blend_common_avx2.cpp
@@ -0,0 +1,230 @@
+// Avisynth+
+// https://avs-plus.net
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// Linking Avisynth statically or dynamically with other modules is making a
+// combined work based on Avisynth.  Thus, the terms and conditions of the GNU
+// General Public License cover the whole combination.
+//
+// As a special exception, the copyright holders of Avisynth give you
+// permission to link Avisynth with independent modules that communicate with
+// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
+// terms of these independent modules, and to copy and distribute the
+// resulting combined work under terms of your choice, provided that
+// every copy of the combined work is accompanied by a complete copy of
+// the source code of Avisynth (the version of Avisynth used to produce the
+// combined work), being distributed under the terms of the GNU General
+// Public License plus this exception.  An independent module is a module
+// which is not derived from or based on Avisynth, such as 3rd-party filters,
+// import and export plugins, or graphical user interfaces.
+
+#include "avisynth.h"
+#include "blend_common_avx2.h"
+#include "../blend_common.h"
+
+#include <stdint.h>
+
+#ifdef AVS_WINDOWS
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+template<typename pixel_t> // 1-byte pixels take the first branch, every other pixel_t is loaded as 16-bit words
+static AVS_FORCEINLINE __m256 Eightpixels_to_floats(const pixel_t* src) { // loads 8 pixels from src (no alignment required) and returns them as 8 floats
+  __m256i srci;
+  if constexpr (sizeof(pixel_t) == 1) {
+    srci = _mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(src))); // 8 bytes -> 8 x int32, zero-extended
+  }
+  else {
+    srci = _mm256_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(src))); // 16 bytes (8 words) -> 8 x int32, zero-extended
+  }
+  return _mm256_cvtepi32_ps(srci); // int32 -> float
+}
+
+template<typename pixel_t, int bits_per_pixel> // bits_per_pixel is only referenced by the disabled clamp below
+static AVS_FORCEINLINE void Store_Eightpixels(pixel_t* dst, __m256 what, const __m256 rounder) { // converts 8 floats to pixel_t and stores them at dst (unaligned store)
+  what = _mm256_add_ps(what, rounder); // callers pass 0.5f, so the truncation below rounds to nearest
+  __m256i si32 = _mm256_cvttps_epi32(what); // float -> int32 with truncation
+  __m256i result = _mm256_packus_epi32(si32, si32); // int32 -> uint16 with unsigned saturation; each 128-bit lane now holds its 4 words twice
+  result = _mm256_permute4x64_epi64(result, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6)); // bring quadwords 0 and 2 together: low 128 bits = the 8 words in order
+  __m128i result128 = _mm256_castsi256_si128(result);
+  if constexpr (sizeof(pixel_t) == 1) {
+    __m128i result64 = _mm_packus_epi16(result128, result128); // uint16 -> uint8 with unsigned saturation
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), result64); // writes 8 bytes
+  } else {
+    /* Disabled clamp: per the original note it cannot trigger once the mask is checked to be within 0..1
+    if constexpr (bits_per_pixel < 16) { // otherwise no clamp needed
+      constexpr int max_pixel_value = (1 << bits_per_pixel) - 1;
+      auto max_pixel_value_v = _mm_set1_epi16(static_cast<uint16_t>(max_pixel_value));
+      result128 = _mm_min_epu16(result128, max_pixel_value_v);
+    }
+    */
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), result128); // writes 16 bytes (8 words)
+  }
+}
+
+AVS_FORCEINLINE static __m256 overlay_blend_avx2_core_new(const __m256& p1_f, const __m256& p2_f, const __m256& factor) { // returns p1_f + (p2_f - p1_f) * factor for 8 floats
+  /*
+  Scalar model of the per-pixel blend; p1*(1-mask_f) + p2*mask_f is rewritten as p1 + (p2-p1)*mask_f:
+  constexpr int max_pixel_value = (1 << bits_per_pixel) - 1;
+  constexpr float factor = 1.0f / max_pixel_value;
+  constexpr float half_rounder = 0.5f;
+  const float mask_f = mask * factor;
+  const float res = p1 + (p2 - p1) * mask_f;
+  int result = (int)(res + 0.5f);
+  */
+  // No rounding here: the rounder is added by the store helper just before conversion to integer
+  auto res = _mm256_add_ps(p1_f, _mm256_mul_ps(_mm256_sub_ps(p2_f, p1_f), factor));
+  return res;
+} 
+
+template<bool has_mask, typename pixel_t, int bits_per_pixel> // has_mask: per-pixel weight mask/max*opacity_f; otherwise the constant weight opacity_f
+void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, // in-place blend into p1: p1 = p1 + (p2 - p1) * weight, rounded to nearest
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, // pitches are in bytes: they advance the BYTE pointers once per row
+  const int width, const int height, const int opacity, const float opacity_f) // width is in pixels; the integer opacity is not used by this kernel
+{
+  auto rounder = _mm256_set1_ps(0.5f);
+  constexpr int max_pixel_value = (1 << bits_per_pixel) - 1;
+  const float mask_scale = 1.0f / max_pixel_value; // maps a mask value of max_pixel_value to 1.0
+  // Without a mask the weight is opacity_f itself (as in overlay_blend_avx2_float); only mask values need the 1/max scaling.
+  const float factor = has_mask ? mask_scale * opacity_f : opacity_f;
+  auto factor_v = _mm256_set1_ps(factor);
+
+  const int realwidth = width * sizeof(pixel_t); // row width in bytes
+
+  // 16 pixels per iteration, processed as two groups of 8
+  constexpr int bytes_per_cycle = 16 * sizeof(pixel_t);
+  int wMod16 = (realwidth / bytes_per_cycle) * bytes_per_cycle;
+
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < wMod16; x += bytes_per_cycle) { // x is a byte offset into the row
+      auto unpacked_p1 = Eightpixels_to_floats<pixel_t>((const pixel_t*)(p1 + x)); // 8x32
+      auto unpacked_p2 = Eightpixels_to_floats<pixel_t>((const pixel_t*)(p2 + x)); // 8x32
+
+      auto unpacked_p1_2 = Eightpixels_to_floats<pixel_t>((const pixel_t*)(p1 + x + bytes_per_cycle / 2)); // 8x32
+      auto unpacked_p2_2 = Eightpixels_to_floats<pixel_t>((const pixel_t*)(p2 + x + bytes_per_cycle / 2)); // 8x32
+
+      __m256 result, result_2;
+      if constexpr (has_mask) {
+        auto unpacked_mask = Eightpixels_to_floats<pixel_t>((const pixel_t*)(mask + x)); // 8x32
+        unpacked_mask = _mm256_mul_ps(unpacked_mask, factor_v); // mask -> weight
+        result = overlay_blend_avx2_core_new(unpacked_p1, unpacked_p2, unpacked_mask);
+
+        auto unpacked_mask_2 = Eightpixels_to_floats<pixel_t>((const pixel_t*)(mask + x + bytes_per_cycle / 2)); // 8x32
+        unpacked_mask_2 = _mm256_mul_ps(unpacked_mask_2, factor_v); // mask -> weight
+        result_2 = overlay_blend_avx2_core_new(unpacked_p1_2, unpacked_p2_2, unpacked_mask_2);
+      }
+      else {
+        result = overlay_blend_avx2_core_new(unpacked_p1, unpacked_p2, factor_v);
+        result_2 = overlay_blend_avx2_core_new(unpacked_p1_2, unpacked_p2_2, factor_v);
+      }
+
+      Store_Eightpixels<pixel_t, bits_per_pixel>((pixel_t*)(p1 + x), result, rounder);
+      Store_Eightpixels<pixel_t, bits_per_pixel>((pixel_t*)(p1 + x + bytes_per_cycle / 2), result_2, rounder);
+    }
+
+    // Remaining pixels of the row (fewer than 16): scalar path, x is a pixel index here
+
+    for (int x = wMod16 / sizeof(pixel_t); x < width; x++) {
+      const float new_factor = has_mask ? static_cast<float>(reinterpret_cast<const pixel_t*>(mask)[x]) * factor : factor;
+      auto result = overlay_blend_c_core_simple(reinterpret_cast<pixel_t*>(p1)[x], reinterpret_cast<const pixel_t*>(p2)[x], new_factor);
+      reinterpret_cast<pixel_t*>(p1)[x] = (pixel_t)(result + 0.5f);
+    }
+
+    p1 += p1_pitch;
+    p2 += p2_pitch;
+    if constexpr (has_mask)
+      mask += mask_pitch;
+  }
+}
+
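+// Explicit instantiations: the template body lives in this .cpp, the header only declares it.
+// First group: has_mask = true; second group (after the separator): has_mask = false.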
+template void overlay_blend_avx2_uint<true, uint8_t, 8>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_avx2_uint<true, uint16_t, 10>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_avx2_uint<true, uint16_t, 12>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_avx2_uint<true, uint16_t, 14>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_avx2_uint<true, uint16_t, 16>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+//--
+template void overlay_blend_avx2_uint<false, uint8_t, 8>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_avx2_uint<false, uint16_t, 10>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_avx2_uint<false, uint16_t, 12>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_avx2_uint<false, uint16_t, 14>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_avx2_uint<false, uint16_t, 16>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+
+template<bool has_mask> // has_mask: per-pixel weight mask*opacity_f; otherwise the constant weight opacity_f
+void overlay_blend_avx2_float(BYTE* p1, const BYTE* p2, const BYTE* mask, // in-place blend of 32-bit float planes: p1 = p1 + (p2 - p1) * weight
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, // pitches are in bytes: they advance the BYTE pointers once per row
+  const int width, const int height, const int opacity, const float opacity_f) // width is in pixels; the integer opacity is not used by this kernel
+{
+
+  const int realwidth = width * sizeof(float); // row width in bytes
+
+  int wMod32 = (realwidth / 32) * 32; // bytes covered by whole 8-float (32-byte) vectors
+  auto opacity_v = _mm256_set1_ps(opacity_f);
+
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < wMod32; x += 32) { // x is a byte offset into the row
+      auto p1_f = _mm256_loadu_ps(reinterpret_cast<const float*>(p1 + x));
+      auto p2_f = _mm256_loadu_ps(reinterpret_cast<const float*>(p2 + x));
+      __m256 new_mask;
+      if constexpr (has_mask) {
+        new_mask = _mm256_loadu_ps(reinterpret_cast<const float*>(mask + x));
+        new_mask = _mm256_mul_ps(new_mask, opacity_v); // mask is used as stored, no normalization, only scaled by opacity
+      }
+      else {
+        new_mask = opacity_v;
+      }
+      auto result = _mm256_add_ps(p1_f, _mm256_mul_ps(_mm256_sub_ps(p2_f, p1_f), new_mask)); // p1*(1-mask) + p2*mask = p1+(p2-p1)*mask
+
+      _mm256_storeu_ps(reinterpret_cast<float*>(p1 + x), result);
+    }
+
+    // Remaining pixels of the row (fewer than 8): scalar path, x is a pixel index here
+
+    for (int x = wMod32 / sizeof(float); x < width; x++) {
+      auto new_mask = has_mask ? reinterpret_cast<const float*>(mask)[x] * opacity_f : opacity_f;
+      auto p1x = reinterpret_cast<float*>(p1)[x];
+      auto p2x = reinterpret_cast<const float*>(p2)[x];
+      auto result = p1x + (p2x - p1x) * new_mask; // == p1x*(1-new_mask) + p2x*new_mask
+      reinterpret_cast<float*>(p1)[x] = result;
+    }
+
+
+    p1 += p1_pitch;
+    p2 += p2_pitch;
+    if constexpr (has_mask)
+      mask += mask_pitch;
+  }
+}
+
+template void overlay_blend_avx2_float<false>(BYTE* p1, const BYTE* p2, const BYTE* mask, // explicit instantiation: constant-opacity variant (mask is never read)
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_avx2_float<true>(BYTE* p1, const BYTE* p2, const BYTE* mask, // explicit instantiation: per-pixel mask variant
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+
diff --git a/avs_core/filters/overlay/intel/blend_common_avx2.h b/avs_core/filters/overlay/intel/blend_common_avx2.h
new file mode 100644
index 0000000000..e3948c19f2
--- /dev/null
+++ b/avs_core/filters/overlay/intel/blend_common_avx2.h
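@@ -0,0 +1,49 @@
+// Avisynth+
+// https://avs-plus.net
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of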
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// Linking Avisynth statically or dynamically with other modules is making a
+// combined work based on Avisynth.  Thus, the terms and conditions of the GNU
+// General Public License cover the whole combination.
+//
+// As a special exception, the copyright holders of Avisynth give you
+// permission to link Avisynth with independent modules that communicate with
+// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
+// terms of these independent modules, and to copy and distribute the
+// resulting combined work under terms of your choice, provided that
+// every copy of the combined work is accompanied by a complete copy of
+// the source code of Avisynth (the version of Avisynth used to produce the
+// combined work), being distributed under the terms of the GNU General
+// Public License plus this exception.  An independent module is a module
+// which is not derived from or based on Avisynth, such as 3rd-party filters,
+// import and export plugins, or graphical user interfaces.
+
+#include "avisynth.h"
+#include <stdint.h>
+
+#ifndef __blend_common_avx2_h
+#define __blend_common_avx2_h
+
+template<bool has_mask, typename pixel_t, int bits_per_pixel> // AVX2 blend for integer pixels; defined and explicitly instantiated in blend_common_avx2.cpp
+void overlay_blend_avx2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, // p1 is read and written in place; mask is only dereferenced when has_mask is true
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); // instantiated for uint8_t/8 and uint16_t/10,12,14,16
+
+template<bool has_mask> // AVX2 blend for 32-bit float pixels; defined and explicitly instantiated in blend_common_avx2.cpp
+void overlay_blend_avx2_float(BYTE* p1, const BYTE* p2, const BYTE* mask, // p1 is read and written in place; mask is only dereferenced when has_mask is true
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f); // instantiated for has_mask = false and true
+
+#endif // __blend_common_avx2_h
diff --git a/avs_core/filters/overlay/intel/blend_common_sse.cpp b/avs_core/filters/overlay/intel/blend_common_sse.cpp
index 8db35b5ff4..731765f475 100644
--- a/avs_core/filters/overlay/intel/blend_common_sse.cpp
+++ b/avs_core/filters/overlay/intel/blend_common_sse.cpp
@@ -38,6 +38,7 @@
 
 #include "blend_common_sse.h"
 #include "../blend_common.h"
+#include "../../../core/internal.h"
 
 // Intrinsics for SSE4.1, SSSE3, SSE3, SSE2, ISSE and MMX
 #include <emmintrin.h>
@@ -126,11 +127,6 @@ AVS_FORCEINLINE static __m128i overlay_merge_mask_sse41_uint16(const __m128i& p1
   return t2;
 }
 
-AVS_FORCEINLINE static __m128i overlay_merge_mask_sse2_float(const __m128i& p1, const __m128i& p2) {
-  __m128 mulres = _mm_mul_ps(_mm_castsi128_ps(p1), _mm_castsi128_ps(p2));
-  return _mm_castps_si128(mulres);
-}
-
 
 /********************************
  ********* Blend Opaque *********
@@ -159,7 +155,7 @@ AVS_FORCEINLINE __m128i overlay_blend_opaque_sse2_core(const __m128i& p1, const
 #ifdef X86_32
 void overlay_blend_mmx_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask,
                                     const int p1_pitch, const int p2_pitch, const int mask_pitch,
-                                    const int width, const int height) {
+                                    const int width, const int height, const int opacity, const float opacity_f) {
         BYTE* original_p1 = p1;
   const BYTE* original_p2 = p2;
   const BYTE* original_mask = mask;
@@ -211,17 +207,10 @@ static AVS_FORCEINLINE __m128i _MM_BLENDV_EPI8(__m128i const &a, __m128i const &
   return _mm_or_si128(_mm_and_si128(selector, b), _mm_andnot_si128(selector, a));
 }
 
-// non-existant in simd
-static AVS_FORCEINLINE __m128i _MM_CMPLE_EPU16(__m128i x, __m128i y)
-{
-  // Returns 0xFFFF where x <= y:
-  return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
-}
-
 // uint8_t only
 void overlay_blend_sse2_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask,
                                      const int p1_pitch, const int p2_pitch, const int mask_pitch,
-                                     const int width, const int height)
+                                     const int width, const int height, const int opacity, const float opacity_f)
 {
   __m128i v128;
   v128 = _mm_set1_epi16(0x0080); // rounder
@@ -288,7 +277,7 @@ __attribute__((__target__("sse4.1")))
 #endif
 void overlay_blend_sse41_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height)
+  const int width, const int height, const int opacity, const float opacity_f)
 {
   __m128i v128;
   if constexpr (sizeof(pixel_t) == 1)
@@ -397,45 +386,344 @@ void overlay_blend_sse41_plane_masked(BYTE *p1, const BYTE *p2, const BYTE *mask
   }
 }
 
-void overlay_blend_sse2_plane_masked_float(BYTE *p1, const BYTE *p2, const BYTE *mask,
+// Explicit instantiations of overlay_blend_sse41_plane_masked, one per supported pixel type / bit depth
+template void overlay_blend_sse41_plane_masked<uint8_t, 8>(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+// 16 bit: SSE4 only
+template void overlay_blend_sse41_plane_masked<uint16_t, 10>(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_plane_masked<uint16_t, 12>(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_plane_masked<uint16_t, 14>(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_plane_masked<uint16_t, 16>(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+
+
+template<typename pixel_t> // 1-byte pixels are widened to words first, every other pixel_t is loaded as 16-bit words
+#if defined(GCC) || defined(CLANG)
+__attribute__((__target__("sse4.1")))
+#endif
+static AVS_FORCEINLINE void Eightpixels_to_Eightfloats(const pixel_t* src, __m128& src_lo, __m128& src_hi, __m128i& zero) { // loads 8 pixels: src_lo = pixels 0..3, src_hi = pixels 4..7 as floats
+  __m128i srci;
+  if constexpr (sizeof(pixel_t) == 1) {
+    srci = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)); // 8 bytes
+    srci = _mm_unpacklo_epi8(srci, zero); // 8 x uint8 -> 8 x uint16; zero is only read (callers pass an all-zero register)
+  }
+  else {
+    srci = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); // 16 bytes = 8 words, unaligned load
+  }
+  src_lo = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(srci)); // low 4 words -> int32 (zero-extended) -> float
+  src_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(srci, zero)); // high 4 words interleaved with zero -> int32 -> float
+}
+
+template<typename pixel_t, int bits_per_pixel> // bits_per_pixel is only referenced by the disabled clamp below
+#if defined(GCC) || defined(CLANG)
+__attribute__((__target__("sse4.1")))
+#endif
+static AVS_FORCEINLINE void Store_Eightpixels(pixel_t* dst, __m128 what_lo, __m128 what_hi, const __m128 rounder) { // converts 2x4 floats to 8 pixels and stores them at dst
+  what_lo = _mm_add_ps(what_lo, rounder); // callers pass 0.5f, so the truncation below rounds to nearest
+  what_hi = _mm_add_ps(what_hi, rounder); // same for pixels 4..7
+  auto si32_lo = _mm_cvttps_epi32(what_lo); // float -> int32 with truncation
+  auto si32_hi = _mm_cvttps_epi32(what_hi); // float -> int32 with truncation
+  auto result = _mm_packus_epi32(si32_lo, si32_hi); // 2x4x32bit -> 8x16, unsigned saturation (SSE4.1 instruction)
+  if constexpr (sizeof(pixel_t) == 1) {
+    __m128i result64 = _mm_packus_epi16(result, result); // 8x16bit -> 8x8, unsigned saturation
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), result64); // writes 8 bytes
+  }
+  else {
+    /* Disabled clamp (per the original note not needed once the mask is checked to be 0..1); it still names result128, called result here
+    if constexpr (bits_per_pixel < 16) { // otherwise no clamp needed
+      constexpr int max_pixel_value = (1 << bits_per_pixel) - 1;
+      auto max_pixel_value_v = _mm_set1_epi16(static_cast<uint16_t>(max_pixel_value));
+      result128 = _mm_min_epu16(result128, max_pixel_value_v);
+    }
+    */
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), result); // writes 16 bytes (8 words), unaligned store
+  }
+}
+
+template<typename pixel_t> // SSE2-only counterpart of Eightpixels_to_Eightfloats: uses unpacklo instead of _mm_cvtepu16_epi32
+#if defined(GCC) || defined(CLANG)
+__attribute__((__target__("sse2")))
+#endif
+static AVS_FORCEINLINE void Eightpixels_to_Eightfloats_sse2(const pixel_t* src, __m128& src_lo, __m128& src_hi, __m128i& zero) { // loads 8 pixels: src_lo = pixels 0..3, src_hi = pixels 4..7 as floats
+  __m128i srci;
+  if constexpr (sizeof(pixel_t) == 1) {
+    srci = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)); // 8 bytes
+    srci = _mm_unpacklo_epi8(srci, zero); // 8 x uint8 -> 8 x uint16; zero is only read (callers pass an all-zero register)
+  }
+  else {
+    srci = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); // 16 bytes = 8 words, unaligned load
+  }
+  src_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(srci, zero)); // low 4 words interleaved with zero -> int32 -> float
+  src_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(srci, zero)); // high 4 words interleaved with zero -> int32 -> float
+}
+
+template<typename pixel_t, int bits_per_pixel> // bits_per_pixel is only referenced by the disabled clamp below
+#if defined(GCC) || defined(CLANG)
+__attribute__((__target__("sse2")))
+#endif
+static AVS_FORCEINLINE void Store_Eightpixels_sse2(pixel_t* dst, __m128 what_lo, __m128 what_hi, const __m128 rounder) { // converts 2x4 floats to 8 pixels and stores them at dst
+  what_lo = _mm_add_ps(what_lo, rounder); // callers pass 0.5f, so the truncation below rounds to nearest
+  what_hi = _mm_add_ps(what_hi, rounder); // same for pixels 4..7
+  auto si32_lo = _mm_cvttps_epi32(what_lo); // float -> int32 with truncation
+  auto si32_hi = _mm_cvttps_epi32(what_hi); // float -> int32 with truncation
+  if constexpr (sizeof(pixel_t) == 1) {
+    auto result = _mm_packs_epi32(si32_lo, si32_hi); // 2x4x32bit -> 8x16 with signed saturation; the unsigned pack below does the final clamp
+    __m128i result64 = _mm_packus_epi16(result, result); // 8x16bit -> 8x8, unsigned saturation
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), result64); // writes 8 bytes
+  }
+  else {
+    auto result = _MM_PACKUS_EPI32(si32_lo, si32_hi); // 2x4x32bit -> 8x16; NOTE(review): helper not in this chunk, presumably an SSE2 emulation of _mm_packus_epi32 - confirm
+    /* Disabled clamp (per the original note not needed once the mask is checked to be 0..1); it still names result128, called result here
+    if constexpr (bits_per_pixel < 16) { // otherwise no clamp needed
+      constexpr int max_pixel_value = (1 << bits_per_pixel) - 1;
+      auto max_pixel_value_v = _mm_set1_epi16(static_cast<uint16_t>(max_pixel_value));
+      result128 = _mm_min_epu16(result128, max_pixel_value_v);
+    }
+    */
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), result); // writes 16 bytes (8 words), unaligned store
+  }
+}
+
+
+
+AVS_FORCEINLINE static __m128 overlay_blend_sse_core_new(const __m128& p1_f, const __m128& p2_f, const __m128& factor) { // returns p1_f + (p2_f - p1_f) * factor for 4 floats
+  /*
+  Scalar model of the per-pixel blend; p1*(1-mask_f) + p2*mask_f is rewritten as p1 + (p2-p1)*mask_f:
+  constexpr int max_pixel_value = (1 << bits_per_pixel) - 1;
+  constexpr float factor = 1.0f / max_pixel_value;
+  constexpr float half_rounder = 0.5f;
+  const float mask_f = mask * factor;
+  const float res = p1 + (p2 - p1) * mask_f;
+  int result = (int)(res + 0.5f);
+  */
+  // No rounding here: the rounder is added by the store helper just before conversion to integer
+  auto res = _mm_add_ps(p1_f, _mm_mul_ps(_mm_sub_ps(p2_f, p1_f), factor));
+  return res;
+}
+
+template<bool has_mask, typename pixel_t, int bits_per_pixel> // has_mask: per-pixel weight mask/max*opacity_f; otherwise the constant weight opacity_f
+#if defined(GCC) || defined(CLANG)
+__attribute__((__target__("sse4.1")))
+#endif
+void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, // in-place blend into p1: p1 = p1 + (p2 - p1) * weight, rounded to nearest
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, // pitches are in bytes: they advance the BYTE pointers once per row
+  const int width, const int height, const int opacity, const float opacity_f) // width is in pixels; the integer opacity is not used by this kernel
+{
+  auto rounder = _mm_set1_ps(0.5f);
+  constexpr int max_pixel_value = (1 << bits_per_pixel) - 1;
+  const float mask_scale = 1.0f / max_pixel_value; // maps a mask value of max_pixel_value to 1.0
+  // Without a mask the weight is opacity_f itself (as in the float kernels); only mask values need the 1/max scaling.
+  const float factor = has_mask ? mask_scale * opacity_f : opacity_f;
+  auto factor_v = _mm_set1_ps(factor);
+
+  const int realwidth = width * sizeof(pixel_t); // row width in bytes
+
+  // 8 pixels per iteration, handled as a low and a high group of 4 floats
+  constexpr int bytes_per_cycle = 8 * sizeof(pixel_t);
+  int wMod8 = (realwidth / bytes_per_cycle) * bytes_per_cycle;
+
+  auto zero = _mm_setzero_si128(); // passed by non-const reference to the load helper, so it is not declared const
+
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < wMod8; x += bytes_per_cycle) { // x is a byte offset into the row
+      __m128 unpacked_p1, unpacked_p1_2;
+      __m128 unpacked_p2, unpacked_p2_2;
+      Eightpixels_to_Eightfloats<pixel_t>((const pixel_t*)(p1 + x), unpacked_p1, unpacked_p1_2, zero); // 8x32
+      Eightpixels_to_Eightfloats<pixel_t>((const pixel_t*)(p2 + x), unpacked_p2, unpacked_p2_2, zero); // 8x32
+
+      __m128 result, result_2;
+      if constexpr (has_mask) {
+        __m128 unpacked_mask, unpacked_mask_2;
+        Eightpixels_to_Eightfloats<pixel_t>((const pixel_t*)(mask + x), unpacked_mask, unpacked_mask_2, zero); // 8x32
+        unpacked_mask = _mm_mul_ps(unpacked_mask, factor_v); // mask -> weight
+        unpacked_mask_2 = _mm_mul_ps(unpacked_mask_2, factor_v); // mask -> weight
+        result = overlay_blend_sse_core_new(unpacked_p1, unpacked_p2, unpacked_mask);
+        result_2 = overlay_blend_sse_core_new(unpacked_p1_2, unpacked_p2_2, unpacked_mask_2);
+      }
+      else {
+        result = overlay_blend_sse_core_new(unpacked_p1, unpacked_p2, factor_v);
+        result_2 = overlay_blend_sse_core_new(unpacked_p1_2, unpacked_p2_2, factor_v);
+      }
+
+      Store_Eightpixels<pixel_t, bits_per_pixel>((pixel_t*)(p1 + x), result, result_2, rounder);
+    }
+
+    // Remaining pixels of the row (fewer than 8): scalar path, x is a pixel index here
+
+    for (int x = wMod8 / sizeof(pixel_t); x < width; x++) {
+      const float new_factor = has_mask ? static_cast<float>(reinterpret_cast<const pixel_t*>(mask)[x]) * factor : factor;
+      auto result = overlay_blend_c_core_simple(reinterpret_cast<pixel_t*>(p1)[x], reinterpret_cast<const pixel_t*>(p2)[x], new_factor);
+      reinterpret_cast<pixel_t*>(p1)[x] = (pixel_t)(result + 0.5f);
+    }
+
+    p1 += p1_pitch;
+    p2 += p2_pitch;
+    if constexpr (has_mask)
+      mask += mask_pitch;
+  }
+}
+
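+// Explicit instantiations: the template body lives in this .cpp file.
+// First group: has_mask = true; second group (after the separator): has_mask = false.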
+template void overlay_blend_sse41_uint<true, uint8_t, 8>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_uint<true, uint16_t, 10>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_uint<true, uint16_t, 12>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_uint<true, uint16_t, 14>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_uint<true, uint16_t, 16>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+//--
+template void overlay_blend_sse41_uint<false, uint8_t, 8>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_uint<false, uint16_t, 10>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_uint<false, uint16_t, 12>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_uint<false, uint16_t, 14>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse41_uint<false, uint16_t, 16>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+
+template<bool has_mask, typename pixel_t, int bits_per_pixel> // has_mask: per-pixel weight mask/max*opacity_f; otherwise the constant weight opacity_f
+#if defined(GCC) || defined(CLANG)
+__attribute__((__target__("sse2")))
+#endif
+void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask, // in-place blend into p1: p1 = p1 + (p2 - p1) * weight, rounded to nearest
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, // pitches are in bytes: they advance the BYTE pointers once per row
+  const int width, const int height, const int opacity, const float opacity_f) // width is in pixels; the integer opacity is not used by this kernel
+{
+  auto rounder = _mm_set1_ps(0.5f);
+  constexpr int max_pixel_value = (1 << bits_per_pixel) - 1;
+  const float mask_scale = 1.0f / max_pixel_value; // maps a mask value of max_pixel_value to 1.0
+  // Without a mask the weight is opacity_f itself (as in the float kernels); only mask values need the 1/max scaling.
+  const float factor = has_mask ? mask_scale * opacity_f : opacity_f;
+  auto factor_v = _mm_set1_ps(factor);
+
+  const int realwidth = width * sizeof(pixel_t); // row width in bytes
+
+  // 8 pixels per iteration, handled as a low and a high group of 4 floats
+  constexpr int bytes_per_cycle = 8 * sizeof(pixel_t);
+  int wMod8 = (realwidth / bytes_per_cycle) * bytes_per_cycle;
+
+  auto zero = _mm_setzero_si128(); // passed by non-const reference to the load helper, so it is not declared const
+
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < wMod8; x += bytes_per_cycle) { // x is a byte offset into the row
+      __m128 unpacked_p1, unpacked_p1_2;
+      __m128 unpacked_p2, unpacked_p2_2;
+      Eightpixels_to_Eightfloats_sse2<pixel_t>((const pixel_t*)(p1 + x), unpacked_p1, unpacked_p1_2, zero); // 8x32
+      Eightpixels_to_Eightfloats_sse2<pixel_t>((const pixel_t*)(p2 + x), unpacked_p2, unpacked_p2_2, zero); // 8x32
+
+      __m128 result, result_2;
+      if constexpr (has_mask) {
+        __m128 unpacked_mask, unpacked_mask_2;
+        Eightpixels_to_Eightfloats_sse2<pixel_t>((const pixel_t*)(mask + x), unpacked_mask, unpacked_mask_2, zero); // 8x32
+        unpacked_mask = _mm_mul_ps(unpacked_mask, factor_v); // mask -> weight
+        unpacked_mask_2 = _mm_mul_ps(unpacked_mask_2, factor_v); // mask -> weight
+        result = overlay_blend_sse_core_new(unpacked_p1, unpacked_p2, unpacked_mask);
+        result_2 = overlay_blend_sse_core_new(unpacked_p1_2, unpacked_p2_2, unpacked_mask_2);
+      }
+      else {
+        result = overlay_blend_sse_core_new(unpacked_p1, unpacked_p2, factor_v);
+        result_2 = overlay_blend_sse_core_new(unpacked_p1_2, unpacked_p2_2, factor_v);
+      }
+
+      Store_Eightpixels_sse2<pixel_t, bits_per_pixel>((pixel_t*)(p1 + x), result, result_2, rounder);
+    }
+
+    // Remaining pixels of the row (fewer than 8): scalar path, x is a pixel index here
+
+    for (int x = wMod8 / sizeof(pixel_t); x < width; x++) {
+      const float new_factor = has_mask ? static_cast<float>(reinterpret_cast<const pixel_t*>(mask)[x]) * factor : factor;
+      auto result = overlay_blend_c_core_simple(reinterpret_cast<pixel_t*>(p1)[x], reinterpret_cast<const pixel_t*>(p2)[x], new_factor);
+      reinterpret_cast<pixel_t*>(p1)[x] = (pixel_t)(result + 0.5f);
+    }
+
+    p1 += p1_pitch;
+    p2 += p2_pitch;
+    if constexpr (has_mask)
+      mask += mask_pitch;
+  }
+}
+
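+// Explicit instantiations: the template body lives in this .cpp file.
+// First group: has_mask = true; second group (after the separator): has_mask = false.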
+template void overlay_blend_sse2_uint<true, uint8_t, 8>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse2_uint<true, uint16_t, 10>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse2_uint<true, uint16_t, 12>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse2_uint<true, uint16_t, 14>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse2_uint<true, uint16_t, 16>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+//--
+template void overlay_blend_sse2_uint<false, uint8_t, 8>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse2_uint<false, uint16_t, 10>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse2_uint<false, uint16_t, 12>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse2_uint<false, uint16_t, 14>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+template void overlay_blend_sse2_uint<false, uint16_t, 16>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+
+
+
+template<bool has_mask>
+void overlay_blend_sse2_float(BYTE* p1, const BYTE* p2, const BYTE* mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height) {
+  const int width, const int height, const int /*opacity*/, const float opacity_f)
+{
+
   const int realwidth = width * sizeof(float);
 
   int wMod16 = (realwidth / 16) * 16;
+  auto opacity_v = _mm_set1_ps(opacity_f);
 
   for (int y = 0; y < height; y++) {
     for (int x = 0; x < wMod16; x += 16) {
-      __m128i p1_f = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p1 + x));
-      __m128i p2_f = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p2 + x));
-      __m128i mask_f = _mm_loadu_si128(reinterpret_cast<const __m128i*>(mask + x));
-
-      __m128i result = overlay_blend_sse2_float_core(p1_f, p2_f, mask_f);
+      auto p1_f = _mm_loadu_ps(reinterpret_cast<const float*>(p1 + x));
+      auto p2_f = _mm_loadu_ps(reinterpret_cast<const float*>(p2 + x));
+      __m128 new_mask;
+      if constexpr (has_mask) {
+        new_mask = _mm_loadu_ps(reinterpret_cast<const float*>(mask + x));
+        new_mask = _mm_mul_ps(new_mask, opacity_v);
+      }
+      else {
+        new_mask = opacity_v;
+      }
+      auto result = _mm_add_ps(p1_f, _mm_mul_ps(_mm_sub_ps(p2_f, p1_f), new_mask)); // p1*(1-mask) + p2*mask = p1+(p2-p1)*mask
 
-      _mm_storeu_si128(reinterpret_cast<__m128i*>(p1 + x), result);
+      _mm_storeu_ps(reinterpret_cast<float*>(p1 + x), result);
     }
 
     // Leftover value
 
     for (int x = wMod16 / sizeof(float); x < width; x++) {
-      float result = overlay_blend_c_core_f(reinterpret_cast<float *>(p1)[x], reinterpret_cast<const float *>(p2)[x], reinterpret_cast<const float *>(mask)[x]);
-      reinterpret_cast<float *>(p1)[x] = result;
+      auto new_mask = has_mask ? reinterpret_cast<const float*>(mask)[x] * opacity_f : opacity_f;
+      auto p1x = reinterpret_cast<float*>(p1)[x];
+      auto p2x = reinterpret_cast<const float*>(p2)[x];
+      auto result = p1x + (p2x - p1x) * new_mask; // p1x*(1-new_mask) + p2x*new_mask
+      reinterpret_cast<float*>(p1)[x] = result;
     }
 
 
     p1 += p1_pitch;
     p2 += p2_pitch;
-    mask += mask_pitch;
+    if constexpr (has_mask)
+      mask += mask_pitch;
   }
 }
 
 // instantiate
-template void overlay_blend_sse41_plane_masked<uint8_t, 8>(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height);
-// 16 bit: SSE4 only
-template void overlay_blend_sse41_plane_masked<uint16_t, 10>(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height);
-template void overlay_blend_sse41_plane_masked<uint16_t, 12>(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height);
-template void overlay_blend_sse41_plane_masked<uint16_t, 14>(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height);
-template void overlay_blend_sse41_plane_masked<uint16_t, 16>(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height);
+template void overlay_blend_sse2_float<false>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int /*opacity*/, const float opacity_f);
+template void overlay_blend_sse2_float<true>(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int /*opacity*/, const float opacity_f);
 
 
 
@@ -487,8 +775,8 @@ void overlay_blend_mmx_plane_opacity(BYTE *p1, const BYTE *p2,
 }
 #endif
 
-void overlay_blend_sse2_plane_opacity(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+void overlay_blend_sse2_plane_opacity(BYTE *p1, const BYTE *p2, const BYTE* /*mask*/ ,
+  const int p1_pitch, const int p2_pitch, const int /*mask_pitch*/,
   const int width, const int height, const int opacity, const float opacity_f) {
 /*
   const int OPACITY_SHIFT  = 8; // opacity always max 0..256
@@ -559,8 +847,8 @@ template<int bits_per_pixel>
 #if defined(GCC) || defined(CLANG)
 __attribute__((__target__("sse4.1")))
 #endif
-void overlay_blend_sse41_plane_opacity_uint16(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+void overlay_blend_sse41_plane_opacity_uint16(BYTE *p1, const BYTE *p2, const BYTE* /*mask*/,
+  const int p1_pitch, const int p2_pitch, const int /*mask_pitch*/,
   const int width, const int height, const int opacity, const float opacity_f)
 {
   /*
@@ -644,72 +932,19 @@ void overlay_blend_sse41_plane_opacity_uint16(BYTE *p1, const BYTE *p2,
   }
 }
 
-void overlay_blend_sse2_plane_opacity_float(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
-  const int width, const int height, const int opacity, const float opacity_f) {
-  /*
-    const int OPACITY_SHIFT  = 8; // opacity always max 0..256
-    const int MASK_CORR_SHIFT = OPACITY_SHIFT; // no mask, mask = opacity, 8 bits always
-    const int half_pixel_value_rounding = (1 << (MASK_CORR_SHIFT - 1));
-
-    // avoid "uint16*uint16 can't get into int32" overflows
-    // no need here, opacity as mask is always 8 bit
-    // typedef std::conditional < sizeof(pixel_t) == 1, int, typename std::conditional < sizeof(pixel_t) == 2, int64_t, float>::type >::type result_t;
-
-    for (int y = 0; y < height; y++) {
-      for (int x = 0; x < width; x++) {
-        pixel_t p1x = reinterpret_cast<pixel_t *>(p1)[x];
-        pixel_t p2x = reinterpret_cast<const pixel_t *>(p2)[x];
-        pixel_t result = (pixel_t)((((p1x << MASK_CORR_SHIFT) | half_pixel_value_rounding) + (p2x-p1x)*opacity) >> MASK_CORR_SHIFT);
-        //BYTE result = overlay_blend_c_core_8(p1[x], p2[x], opacity);
-        reinterpret_cast<pixel_t *>(p1)[x] = result;
-      }
-  */
-  AVS_UNUSED(opacity);
-
-  __m128i mask;
-  mask = _mm_castps_si128(_mm_set1_ps(opacity_f));
-  const int realwidth = width * sizeof(float);
-
-  int wMod16 = (realwidth / 16) * 16;
-
-  for (int y = 0; y < height; y++) {
-    for (int x = 0; x < wMod16; x += 16) {
-      __m128i p1_f, p2_f;
-
-      p1_f = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p1 + x));
-      p2_f = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p2 + x));
-
-      __m128i result;
-      // sizeof(pixel_t) == 4, float
-      result = overlay_blend_sse2_float_core(p1_f, p2_f, mask);
-
-      _mm_storeu_si128(reinterpret_cast<__m128i*>(p1 + x), result);
-    }
-
-    // Leftover value
-    for (int x = wMod16 / sizeof(float); x < width; x++) {
-      float result = overlay_blend_c_core_f(reinterpret_cast<float *>(p1)[x], reinterpret_cast<const float *>(p2)[x], opacity_f);
-      reinterpret_cast<float *>(p1)[x] = result;
-    }
-
-    p1 += p1_pitch;
-    p2 += p2_pitch;
-  }
-}
 
 // instantiate
-template void overlay_blend_sse41_plane_opacity_uint16<10>(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+template void overlay_blend_sse41_plane_opacity_uint16<10>(BYTE *p1, const BYTE *p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
-template void overlay_blend_sse41_plane_opacity_uint16<12>(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+template void overlay_blend_sse41_plane_opacity_uint16<12>(BYTE *p1, const BYTE *p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
-template void overlay_blend_sse41_plane_opacity_uint16<14>(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+template void overlay_blend_sse41_plane_opacity_uint16<14>(BYTE *p1, const BYTE *p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
-template void overlay_blend_sse41_plane_opacity_uint16<16>(BYTE *p1, const BYTE *p2,
-  const int p1_pitch, const int p2_pitch,
+template void overlay_blend_sse41_plane_opacity_uint16<16>(BYTE *p1, const BYTE *p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
 
 
@@ -954,47 +1189,6 @@ void overlay_blend_sse41_plane_masked_opacity(BYTE *p1, const BYTE *p2, const BY
   }
 }
 
-void overlay_blend_sse2_plane_masked_opacity_float(BYTE *p1, const BYTE *p2, const BYTE *mask,
-  const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height, const int opacity, const float opacity_f) {
-
-  AVS_UNUSED(opacity_f);
-
-  __m128i opacity_mask = _mm_castps_si128(_mm_set1_ps(opacity_f));
-  const int realwidth = width * sizeof(float);
-
-  int wMod16 = (realwidth / 16) * 16;
-
-  for (int y = 0; y < height; y++) {
-    for (int x = 0; x < wMod16; x += 16) {
-      __m128i p1_f, p2_f;
-      __m128i mask_f;
-
-      p1_f = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p1 + x));
-      p2_f = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p2 + x));
-      mask_f = _mm_loadu_si128(reinterpret_cast<const __m128i*>(mask + x));
-
-      mask_f = overlay_merge_mask_sse2_float(mask_f, opacity_mask);
-      __m128i result = overlay_blend_sse2_float_core(p1_f, p2_f, mask_f);
-
-      _mm_storeu_si128(reinterpret_cast<__m128i*>(p1 + x), result);
-    }
-
-    // Leftover value
-    for (int x = wMod16 / sizeof(float); x < width; x++) {
-      float new_mask = (reinterpret_cast<const float *>(mask)[x] * opacity_f);
-      float p1x = reinterpret_cast<float *>(p1)[x];
-      float p2x = reinterpret_cast<const float *>(p2)[x];
-
-      float result = p1x + (p2x - p1x)*new_mask;
-      reinterpret_cast<float *>(p1)[x] = result;
-    }
-
-    p1 += p1_pitch;
-    p2 += p2_pitch;
-    mask += mask_pitch;
-  }
-}
 
 // instantiate
 template void overlay_blend_sse41_plane_masked_opacity<uint8_t,8>(BYTE *p1, const BYTE *p2, const BYTE *mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
diff --git a/avs_core/filters/overlay/intel/blend_common_sse.h b/avs_core/filters/overlay/intel/blend_common_sse.h
index 2b181f53d4..9952f737f7 100644
--- a/avs_core/filters/overlay/intel/blend_common_sse.h
+++ b/avs_core/filters/overlay/intel/blend_common_sse.h
@@ -43,9 +43,11 @@
 #ifdef X86_32
 void overlay_blend_mmx_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height);
+  const int width, const int height, const int opacity, const float opacity_f);
 #endif
-void overlay_blend_sse2_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height);
+void overlay_blend_sse2_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask, 
+  const int p1_pitch, const int p2_pitch, const int mask_pitch, 
+  const int width, const int height, const int opacity, const float opacity_f);
 
 template<typename pixel_t, int bits_per_pixel>
 #if defined(GCC) || defined(CLANG)
@@ -53,31 +55,46 @@ __attribute__((__target__("sse4.1")))
 #endif
 void overlay_blend_sse41_plane_masked(BYTE* p1, const BYTE* p2, const BYTE* mask,
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
-  const int width, const int height);
+  const int width, const int height, const int opacity, const float opacity_f);
+
+template<bool has_mask, typename pixel_t, int bits_per_pixel>
+#if defined(GCC) || defined(CLANG)
+__attribute__((__target__("sse4.1")))
+#endif
+void overlay_blend_sse41_uint(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
+  const int width, const int height, const int opacity, const float opacity_f);
+
+template<bool has_mask, typename pixel_t, int bits_per_pixel>
+#if defined(GCC) || defined(CLANG)
+__attribute__((__target__("sse2")))
+#endif
+void overlay_blend_sse2_uint(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
+  const int width, const int height, const int opacity, const float opacity_f);
 
-void overlay_blend_sse2_plane_masked_float(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height);
+template<bool has_mask>
+void overlay_blend_sse2_float(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
+  const int width, const int height, const int opacity, const float opacity_f);
 
 
 #ifdef X86_32
-void overlay_blend_mmx_plane_opacity(BYTE* p1, const BYTE* p2,
-  const int p1_pitch, const int p2_pitch,
+void overlay_blend_mmx_plane_opacity(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
 #endif
 
-void overlay_blend_sse2_plane_opacity(BYTE* p1, const BYTE* p2,
-  const int p1_pitch, const int p2_pitch,
+void overlay_blend_sse2_plane_opacity(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
 
 template<int bits_per_pixel>
 #if defined(GCC) || defined(CLANG)
 __attribute__((__target__("sse4.1")))
 #endif
-void overlay_blend_sse41_plane_opacity_uint16(BYTE* p1, const BYTE* p2,
-  const int p1_pitch, const int p2_pitch,
-  const int width, const int height, const int opacity, const float opacity_f);
-
-void overlay_blend_sse2_plane_opacity_float(BYTE* p1, const BYTE* p2,
-  const int p1_pitch, const int p2_pitch,
+void overlay_blend_sse41_plane_opacity_uint16(BYTE* p1, const BYTE* p2, const BYTE* mask,
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
 
 
@@ -87,7 +104,9 @@ void overlay_blend_mmx_plane_masked_opacity(BYTE* p1, const BYTE* p2, const BYTE
   const int width, const int height, const int opacity, const float opacity_f);
 #endif
 
-void overlay_blend_sse2_plane_masked_opacity(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
+void overlay_blend_sse2_plane_masked_opacity(BYTE* p1, const BYTE* p2, const BYTE* mask, 
+  const int p1_pitch, const int p2_pitch, const int mask_pitch,
+  const int width, const int height, const int opacity, const float opacity_f);
 
 template<typename pixel_t, int bits_per_pixel>
 #if defined(GCC) || defined(CLANG)
@@ -97,8 +116,6 @@ void overlay_blend_sse41_plane_masked_opacity(BYTE* p1, const BYTE* p2, const BY
   const int p1_pitch, const int p2_pitch, const int mask_pitch,
   const int width, const int height, const int opacity, const float opacity_f);
 
-void overlay_blend_sse2_plane_masked_opacity_float(BYTE* p1, const BYTE* p2, const BYTE* mask, const int p1_pitch, const int p2_pitch, const int mask_pitch, const int width, const int height, const int opacity, const float opacity_f);
-
 #ifdef X86_32
 void overlay_darken_mmx(BYTE* p1Y, BYTE* p1U, BYTE* p1V, const BYTE* p2Y, const BYTE* p2U, const BYTE* p2V, int p1_pitch, int p2_pitch, int width, int height);
 void overlay_lighten_mmx(BYTE* p1Y, BYTE* p1U, BYTE* p1V, const BYTE* p2Y, const BYTE* p2U, const BYTE* p2V, int p1_pitch, int p2_pitch, int width, int height);
diff --git a/avs_core/filters/overlay/overlay.cpp b/avs_core/filters/overlay/overlay.cpp
index 451c25fd93..29623f84f3 100644
--- a/avs_core/filters/overlay/overlay.cpp
+++ b/avs_core/filters/overlay/overlay.cpp
@@ -719,6 +719,9 @@ void Overlay::SetOfModeByName(const char* name, IScriptEnvironment* env) {
   if (!lstrcmpi(name, "Blend")) {
     of_mode = OF_Blend;
   }
+  else if (!lstrcmpi(name, "Blend_Compat")) {
+    of_mode = OF_Blend_Compat;
+  }
   else if (!lstrcmpi(name, "Add")) {
     of_mode = OF_Add;
   }
@@ -758,7 +761,9 @@ void Overlay::SetOfModeByName(const char* name, IScriptEnvironment* env) {
 OverlayFunction* Overlay::SelectFunction()
 {
   switch (of_mode) {
-  case OF_Blend: return new OL_BlendImage();
+  case OF_Blend: 
+  case OF_Blend_Compat:
+    return new OL_BlendImage();
   case OF_Add: return new OL_AddImage();
   case OF_Subtract: return new OL_AddImage(); // common with Add    //return new OL_SubtractImage();
   case OF_Multiply: return new OL_MultiplyImage();
diff --git a/avs_core/filters/overlay/overlayfunctions.h b/avs_core/filters/overlay/overlayfunctions.h
index c461516d0e..2f258059c8 100644
--- a/avs_core/filters/overlay/overlayfunctions.h
+++ b/avs_core/filters/overlay/overlayfunctions.h
@@ -54,7 +54,8 @@ enum {
   OF_SoftLight,
   OF_HardLight,
   OF_Difference,
-  OF_Exclusion
+  OF_Exclusion,
+  OF_Blend_Compat
 };
 
 class OverlayFunction {