From 6e0b494191107f1e5b3afa6981825d314f800a50 Mon Sep 17 00:00:00 2001
From: Manuel Virgilio <real_virgil@yahoo.it>
Date: Tue, 14 Nov 2023 22:42:28 +0100
Subject: [PATCH 1/2] added cpu optimization detection and avx optimization

---
 Makefile                                     |  1 +
 common/optimization/avx.cpp                  | 72 ++++++++++++++++++++
 common/optimization/avx.hpp                  | 13 ++++
 common/optimization/default.cpp              | 26 +++++++
 common/optimization/default.hpp              | 14 ++++
 common/optimization/optimization.cpp         | 65 ++++++++++++++++++
 common/optimization/optimization.hpp         | 13 ++++
 plugins/dragonfly-early-reflections/DSP.cpp  | 16 ++---
 plugins/dragonfly-early-reflections/DSP.hpp  |  3 +
 plugins/dragonfly-early-reflections/Makefile |  7 +-
 plugins/dragonfly-hall-reverb/DSP.cpp        | 34 ++++-----
 plugins/dragonfly-hall-reverb/DSP.hpp        |  3 +
 plugins/dragonfly-hall-reverb/Makefile       |  7 +-
 plugins/dragonfly-plate-reverb/DSP.cpp       | 16 ++---
 plugins/dragonfly-plate-reverb/DSP.hpp       |  3 +
 plugins/dragonfly-plate-reverb/Makefile      |  7 +-
 plugins/dragonfly-room-reverb/DSP.cpp        | 34 ++++-----
 plugins/dragonfly-room-reverb/DSP.hpp        |  3 +
 plugins/dragonfly-room-reverb/Makefile       |  7 +-
 19 files changed, 286 insertions(+), 58 deletions(-)
 create mode 100644 common/optimization/avx.cpp
 create mode 100644 common/optimization/avx.hpp
 create mode 100644 common/optimization/default.cpp
 create mode 100644 common/optimization/default.hpp
 create mode 100644 common/optimization/optimization.cpp
 create mode 100644 common/optimization/optimization.hpp

diff --git a/Makefile b/Makefile
index 8694093..272c253 100644
--- a/Makefile
+++ b/Makefile
@@ -48,6 +48,7 @@ clean:
 	rm -f common/*.d common/*.o
 	rm -f common/freeverb/*.d common/freeverb/*.o
 	rm -f common/kiss_fft/*.d common/kiss_fft/*.o
+	rm -f common/optimization/*.d common/optimization/*.o
 	rm -f dpf/utils/lv2_ttl_generator.d
 
 # --------------------------------------------------------------
diff --git a/common/optimization/avx.cpp b/common/optimization/avx.cpp
new file mode 100644
index 0000000..539097d
--- /dev/null
+++ b/common/optimization/avx.cpp
@@ -0,0 +1,72 @@
+#include "avx.hpp"
+
+#include <immintrin.h>
+
+void VSUM32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length)
+{
+    float extracted[8];
+    int32_t processed = 0;
+    
+    while ( processed < length )
+    {
+        int processing = ((length - processed) % 8) + 1;
+
+        const __m256 mm_op1 = _mm256_loadu_ps(op1 + processed);
+        const __m256 mm_op2 = _mm256_loadu_ps(op2 + processed );
+        __m256 mm_res = _mm256_add_ps(mm_op1, mm_op2);
+        
+        _mm256_storeu_ps(extracted, mm_res);
+        for ( int i = 0 ; i < processing ; i++ )
+        {
+            result[processed + i] = extracted[i];
+        }
+
+        processed += processing;
+    }
+}
+
+void VMUL32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length)
+{
+    float extracted[8];
+    int32_t processed = 0;
+ 
+    while ( processed < length )
+    {
+        int processing = ((length - processed) % 8) + 1;
+        const __m256 mm_op1 = _mm256_loadu_ps(op1 + processed);
+        const __m256 mm_op2 = _mm256_loadu_ps(op2 + processed);
+        __m256 mm_res = _mm256_mul_ps(mm_op1, mm_op2);
+        
+        _mm256_storeu_ps(extracted, mm_res);
+        for ( int i = 0 ; i < processing ; i++ )
+        {
+            result[processed + i] = extracted[i];
+        }
+
+        processed += processing;
+    }
+}
+
+void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32_t length)
+{    
+    float extracted[8];
+    const __m256 mm_op1 = _mm256_set1_ps(value);
+        
+    int processed = 0;
+    while ( processed < length )
+    {
+        int processing = ((length - processed) % 8) + 1;
+
+        const __m256 mm_op2 = _mm256_loadu_ps(vec + processed);
+        __m256 mm_res = _mm256_mul_ps(mm_op1, mm_op2);
+      
+        _mm256_storeu_ps(extracted, mm_res);
+        for ( int i = 0 ; i < processing ; i++ )
+        {
+            result[processed + i] = extracted[i];
+        }
+
+        processed += processing;
+    }
+}
+
diff --git a/common/optimization/avx.hpp b/common/optimization/avx.hpp
new file mode 100644
index 0000000..6b0d1e3
--- /dev/null
+++ b/common/optimization/avx.hpp
@@ -0,0 +1,13 @@
+
+#ifndef OPTIMIZATION_AVX_H_INCLUDED
+#define OPTIMIZATION_AVX_H_INCLUDED
+
+#include <stdint.h>
+
+void VSUM32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length);
+
+void VMUL32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length);
+
+void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32_t length);
+
+#endif
\ No newline at end of file
diff --git a/common/optimization/default.cpp b/common/optimization/default.cpp
new file mode 100644
index 0000000..b12033f
--- /dev/null
+++ b/common/optimization/default.cpp
@@ -0,0 +1,26 @@
+#include "default.hpp"
+
+void VSUM32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length)
+{
+    for ( int32_t i = 0 ; i < length ; i++ )
+    {
+        result[i] = op1[i] + op2[i];
+    }
+}
+
+void VMUL32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length)
+{
+    // Plain scalar element-wise product: result[k] = op1[k] * op2[k] for k in [0, length).
+    for ( int32_t k = 0 ; k < length ; ++k ) {
+        result[k] = op1[k] * op2[k];
+    }
+}
+
+void VMUL32FLOAT_V_default(const float value, const float* vec, float* result, int32_t length)
+{
+    for ( int32_t i = 0 ; i < length ; i++ )
+    {
+        result[i] = value * vec[i];
+    }
+}
+
diff --git a/common/optimization/default.hpp b/common/optimization/default.hpp
new file mode 100644
index 0000000..ebdd7a8
--- /dev/null
+++ b/common/optimization/default.hpp
@@ -0,0 +1,14 @@
+
+#ifndef OPTIMIZATION_DEFAULT_H_INCLUDED
+#define OPTIMIZATION_DEFAULT_H_INCLUDED
+
+#include <stdint.h>
+
+void VSUM32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length);
+
+void VMUL32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length);
+
+void VMUL32FLOAT_V_default(const float value, const float* vec, float* result, int32_t length);
+
+
+#endif
\ No newline at end of file
diff --git a/common/optimization/optimization.cpp b/common/optimization/optimization.cpp
new file mode 100644
index 0000000..b6066f9
--- /dev/null
+++ b/common/optimization/optimization.cpp
@@ -0,0 +1,65 @@
+
+#include "optimization.hpp"
+#include <stdint.h>
+
+#include "default.hpp"
+#include "avx.hpp"
+
+typedef struct 
+{
+    void (*VSUM32FLOAT)(const float* op1, const float* op2, float* result, int32_t length);    
+    void (*VMUL32FLOAT)(const float* op1, const float* op2, float* result, int32_t length);    
+    void (*VMUL32FLOAT_V)(const float value, const float* vec, float* result, int32_t length);
+} _CpuOptimization;
+
+int8_t OptimizationInitialized = 0;
+_CpuOptimization CpuOptimization;
+
+void SetupOptimization()
+{
+    __builtin_cpu_init();
+    if ( __builtin_cpu_supports("avx") )
+    {
+        CpuOptimization.VSUM32FLOAT = VSUM32FLOAT_avx;
+        CpuOptimization.VMUL32FLOAT = VMUL32FLOAT_avx;
+        CpuOptimization.VMUL32FLOAT_V = VMUL32FLOAT_V_avx;
+    }
+    else
+    {
+        CpuOptimization.VSUM32FLOAT = VSUM32FLOAT_default;
+        CpuOptimization.VMUL32FLOAT = VMUL32FLOAT_default;
+        CpuOptimization.VMUL32FLOAT_V = VMUL32FLOAT_V_default;
+    }
+    OptimizationInitialized = 1;
+}
+
+
+void VSUM32FLOAT(const float* op1, const float* op2, float* result, int32_t length)
+{
+    if (!OptimizationInitialized)
+    {
+        SetupOptimization();
+    }
+
+    CpuOptimization.VSUM32FLOAT(op1,op2,result,length);
+}
+
+void VMUL32FLOAT(const float* op1, const float* op2, float* result, int32_t length)
+{
+    if (!OptimizationInitialized)
+    {
+        SetupOptimization();
+    }
+
+    CpuOptimization.VMUL32FLOAT(op1,op2,result,length);
+}
+
+void VMUL32FLOAT_V(const float value, const float* vec, float* result, int32_t length)
+{
+    if (!OptimizationInitialized)
+    {
+        SetupOptimization();
+    }
+
+    CpuOptimization.VMUL32FLOAT_V(value,vec,result,length);
+}
\ No newline at end of file
diff --git a/common/optimization/optimization.hpp b/common/optimization/optimization.hpp
new file mode 100644
index 0000000..a3f1a89
--- /dev/null
+++ b/common/optimization/optimization.hpp
@@ -0,0 +1,13 @@
+
+#ifndef OPTIMIZATION_H_INCLUDED
+#define OPTIMIZATION_H_INCLUDED
+
+#include <stdint.h>
+
+void VSUM32FLOAT(const float* op1, const float* op2, float* result, int32_t length);
+
+void VMUL32FLOAT(const float* op1, const float* op2, float* result, int32_t length);
+
+void VMUL32FLOAT_V(const float value, const float* vec, float* result, int32_t length);
+
+#endif
\ No newline at end of file
diff --git a/plugins/dragonfly-early-reflections/DSP.cpp b/plugins/dragonfly-early-reflections/DSP.cpp
index d1f3439..858caa4 100644
--- a/plugins/dragonfly-early-reflections/DSP.cpp
+++ b/plugins/dragonfly-early-reflections/DSP.cpp
@@ -19,6 +19,7 @@
 #include "DistrhoPlugin.hpp"
 #include "DistrhoPluginInfo.h"
 #include "extra/ScopedDenormalDisable.hpp"
+#include "optimization/optimization.hpp"
 
 #include "DSP.hpp"
 
@@ -91,16 +92,13 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
         buffer_frames
     );
 
-    for (uint32_t i = 0; i < buffer_frames; i++) {
-      outputs[0][offset + i] =
-        dryLevel * inputs[0][offset + i]  +
-        wetLevel * output_buffer[0][i];
-
-      outputs[1][offset + i] =
-        dryLevel * inputs[1][offset + i]  +
-        wetLevel * output_buffer[1][i];
-    }
+    VMUL32FLOAT_V( dryLevel, &inputs[0][offset], dry_buffer, buffer_frames );
+    VMUL32FLOAT_V( wetLevel, output_buffer[0], wet_buffer, buffer_frames );
+    VSUM32FLOAT(dry_buffer, wet_buffer, &outputs[0][offset], buffer_frames );
 
+    VMUL32FLOAT_V( dryLevel, &inputs[1][offset], dry_buffer, buffer_frames );
+    VMUL32FLOAT_V( wetLevel, output_buffer[1], wet_buffer, buffer_frames );
+    VSUM32FLOAT(dry_buffer, wet_buffer, &outputs[1][offset], buffer_frames );
   }
 }
 
diff --git a/plugins/dragonfly-early-reflections/DSP.hpp b/plugins/dragonfly-early-reflections/DSP.hpp
index 6b43d74..447cf72 100644
--- a/plugins/dragonfly-early-reflections/DSP.hpp
+++ b/plugins/dragonfly-early-reflections/DSP.hpp
@@ -46,6 +46,9 @@ class DragonflyReverbDSP : public AbstractDSP {
   float input_buffer[2][BUFFER_SIZE];
   float output_buffer[2][BUFFER_SIZE];
 
+  float dry_buffer[BUFFER_SIZE];
+  float wet_buffer[BUFFER_SIZE];
+
   void setInputLPF(float freq);
   void setInputHPF(float freq);
 };
diff --git a/plugins/dragonfly-early-reflections/Makefile b/plugins/dragonfly-early-reflections/Makefile
index 5b28096..f0c1af3 100644
--- a/plugins/dragonfly-early-reflections/Makefile
+++ b/plugins/dragonfly-early-reflections/Makefile
@@ -14,7 +14,10 @@ NAME = DragonflyEarlyReflections
 
 FILES_COMMON  = DSP.cpp \
 	../../common/kiss_fft/kiss_fft.c \
-	../../common/kiss_fft/kiss_fftr.c
+	../../common/kiss_fft/kiss_fftr.c \
+	../../common/optimization/optimization.cpp \
+	../../common/optimization/default.cpp \
+	../../common/optimization/avx.cpp
 
 ifneq ($(SYSTEM_FREEVERB3),true)
 FILES_COMMON += \
@@ -58,7 +61,7 @@ include ../../dpf/Makefile.plugins.mk
 # --------------------------------------------------------------
 # Build dependencies
 
-BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT
+BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT -mavx
 ifeq ($(SYSTEM_FREEVERB3),true)
 BUILD_CXX_FLAGS += -DLIBSRATE1
 BUILD_CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags freeverb3-3)
diff --git a/plugins/dragonfly-hall-reverb/DSP.cpp b/plugins/dragonfly-hall-reverb/DSP.cpp
index 3878b12..dcad8c1 100644
--- a/plugins/dragonfly-hall-reverb/DSP.cpp
+++ b/plugins/dragonfly-hall-reverb/DSP.cpp
@@ -18,6 +18,7 @@
 #include "DistrhoPlugin.hpp"
 #include "DistrhoPluginInfo.h"
 #include "extra/ScopedDenormalDisable.hpp"
+#include "optimization/optimization.hpp"
 
 #include "DSP.hpp"
 
@@ -116,10 +117,11 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
       early_out_buffer[1],
       buffer_frames);
       
-    for (uint32_t i = 0; i < buffer_frames; i++) {
-      late_in_buffer[0][i] = early_send * early_out_buffer[0][i] + inputs[0][offset + i];
-      late_in_buffer[1][i] = early_send * early_out_buffer[1][i] + inputs[1][offset + i];
-    }
+    VMUL32FLOAT_V(early_send, early_out_buffer[0], early_buffer, buffer_frames);
+    VSUM32FLOAT(early_buffer, &inputs[0][offset], late_in_buffer[0], buffer_frames );
+
+    VMUL32FLOAT_V(early_send, early_out_buffer[1], early_buffer, buffer_frames);
+    VSUM32FLOAT(early_buffer, &inputs[1][offset], late_in_buffer[1], buffer_frames );
     
     late.processreplace(
       const_cast<float *>(late_in_buffer[0]),
@@ -128,23 +130,23 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
       late_out_buffer[1],
       buffer_frames);
       
-    for (uint32_t i = 0; i < buffer_frames; i++) {
-      outputs[0][offset + i] = dryLevel   * inputs[0][offset + i];
-      outputs[1][offset + i] = dryLevel   * inputs[1][offset + i];
-    }
+    VMUL32FLOAT_V(dryLevel, &inputs[0][offset], &outputs[0][offset], buffer_frames);
+    VMUL32FLOAT_V(dryLevel, &inputs[1][offset], &outputs[1][offset], buffer_frames);
 
     if( earlyLevel > 0.0 ){
-      for (uint32_t i = 0; i < buffer_frames; i++) {
-        outputs[0][offset + i] += earlyLevel * early_out_buffer[0][i];
-        outputs[1][offset + i] += earlyLevel * early_out_buffer[1][i];
-      }
+      VMUL32FLOAT_V(earlyLevel, early_out_buffer[0], early_buffer, buffer_frames);
+      VSUM32FLOAT(&outputs[0][offset], early_buffer, &outputs[0][offset], buffer_frames);
+
+      VMUL32FLOAT_V(earlyLevel, early_out_buffer[1], early_buffer, buffer_frames);
+      VSUM32FLOAT(&outputs[1][offset], early_buffer, &outputs[1][offset], buffer_frames);
     }
     
     if( lateLevel > 0.0 ){
-      for (uint32_t i = 0; i < buffer_frames; i++) {
-        outputs[0][offset + i] += lateLevel  * late_out_buffer[0][i];
-        outputs[1][offset + i] += lateLevel  * late_out_buffer[1][i];
-      }
+      VMUL32FLOAT_V(lateLevel, late_out_buffer[0], late_buffer, buffer_frames);
+      VSUM32FLOAT(&outputs[0][offset], late_buffer, &outputs[0][offset], buffer_frames);
+
+      VMUL32FLOAT_V(lateLevel, late_out_buffer[1], late_buffer, buffer_frames);
+      VSUM32FLOAT(&outputs[1][offset], late_buffer, &outputs[1][offset], buffer_frames);
     }
   }
 }
diff --git a/plugins/dragonfly-hall-reverb/DSP.hpp b/plugins/dragonfly-hall-reverb/DSP.hpp
index 1271c6e..72102ab 100644
--- a/plugins/dragonfly-hall-reverb/DSP.hpp
+++ b/plugins/dragonfly-hall-reverb/DSP.hpp
@@ -47,6 +47,9 @@ class DragonflyReverbDSP : public AbstractDSP {
   float early_out_buffer[2][BUFFER_SIZE];
   float late_in_buffer[2][BUFFER_SIZE];
   float late_out_buffer[2][BUFFER_SIZE];
+
+  float early_buffer[BUFFER_SIZE];
+  float late_buffer[BUFFER_SIZE];
 };
 
 #endif
diff --git a/plugins/dragonfly-hall-reverb/Makefile b/plugins/dragonfly-hall-reverb/Makefile
index 8c5c246..9170538 100644
--- a/plugins/dragonfly-hall-reverb/Makefile
+++ b/plugins/dragonfly-hall-reverb/Makefile
@@ -14,7 +14,10 @@ NAME = DragonflyHallReverb
 
 FILES_COMMON  = DSP.cpp \
 	../../common/kiss_fft/kiss_fft.c \
-	../../common/kiss_fft/kiss_fftr.c
+	../../common/kiss_fft/kiss_fftr.c \
+	../../common/optimization/optimization.cpp \
+	../../common/optimization/default.cpp \
+	../../common/optimization/avx.cpp
 
 ifneq ($(SYSTEM_FREEVERB3),true)
 FILES_COMMON += \
@@ -59,7 +62,7 @@ include ../../dpf/Makefile.plugins.mk
 # --------------------------------------------------------------
 # Build dependencies
 
-BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT
+BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT -mavx
 ifeq ($(SYSTEM_FREEVERB3),true)
 BUILD_CXX_FLAGS += -DLIBSRATE1
 BUILD_CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags freeverb3-3)
diff --git a/plugins/dragonfly-plate-reverb/DSP.cpp b/plugins/dragonfly-plate-reverb/DSP.cpp
index 375cd39..3a32418 100644
--- a/plugins/dragonfly-plate-reverb/DSP.cpp
+++ b/plugins/dragonfly-plate-reverb/DSP.cpp
@@ -19,6 +19,7 @@
 #include "DistrhoPlugin.hpp"
 #include "DistrhoPluginInfo.h"
 #include "extra/ScopedDenormalDisable.hpp"
+#include "optimization/optimization.hpp"
 
 #include "DSP.hpp"
 
@@ -245,16 +246,13 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
         buffer_frames
     );
 
-    for (uint32_t i = 0; i < buffer_frames; i++) {
-      outputs[0][offset + i] =
-        dryLevel   * inputs[0][offset + i]  +
-        wetLevel   * output_buffer[0][i];
-
-      outputs[1][offset + i] =
-        dryLevel   * inputs[1][offset + i]  +
-        wetLevel   * output_buffer[1][i];
-    }
+    VMUL32FLOAT_V( dryLevel, &inputs[0][offset], dry_buffer, buffer_frames);
+    VMUL32FLOAT_V( wetLevel, output_buffer[0], wet_buffer, buffer_frames);
+    VSUM32FLOAT(dry_buffer, wet_buffer, &outputs[0][offset], buffer_frames);
 
+    VMUL32FLOAT_V( dryLevel, &inputs[1][offset], dry_buffer, buffer_frames);
+    VMUL32FLOAT_V( wetLevel, output_buffer[1], wet_buffer, buffer_frames);
+    VSUM32FLOAT(dry_buffer, wet_buffer, &outputs[1][offset], buffer_frames);
   }
 }
 
diff --git a/plugins/dragonfly-plate-reverb/DSP.hpp b/plugins/dragonfly-plate-reverb/DSP.hpp
index 8907af0..da40d82 100644
--- a/plugins/dragonfly-plate-reverb/DSP.hpp
+++ b/plugins/dragonfly-plate-reverb/DSP.hpp
@@ -79,6 +79,9 @@ class DragonflyReverbDSP : public AbstractDSP {
   float filtered_input_buffer[2][BUFFER_SIZE];
   float output_buffer[2][BUFFER_SIZE];
 
+  float dry_buffer[BUFFER_SIZE];
+  float wet_buffer[BUFFER_SIZE];
+
   void setInputLPF(float freq);
   void setInputHPF(float freq);
 };
diff --git a/plugins/dragonfly-plate-reverb/Makefile b/plugins/dragonfly-plate-reverb/Makefile
index 9b12f9c..bddfe0e 100644
--- a/plugins/dragonfly-plate-reverb/Makefile
+++ b/plugins/dragonfly-plate-reverb/Makefile
@@ -14,7 +14,10 @@ NAME = DragonflyPlateReverb
 
 FILES_COMMON  = DSP.cpp \
 	../../common/kiss_fft/kiss_fft.c \
-	../../common/kiss_fft/kiss_fftr.c
+	../../common/kiss_fft/kiss_fftr.c \
+	../../common/optimization/optimization.cpp \
+	../../common/optimization/default.cpp \
+	../../common/optimization/avx.cpp
 
 ifneq ($(SYSTEM_FREEVERB3),true)
 FILES_COMMON += \
@@ -59,7 +62,7 @@ include ../../dpf/Makefile.plugins.mk
 # --------------------------------------------------------------
 # Build dependencies
 
-BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT
+BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT -mavx
 ifeq ($(SYSTEM_FREEVERB3),true)
 BUILD_CXX_FLAGS += -DLIBSRATE1
 BUILD_CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags freeverb3-3)
diff --git a/plugins/dragonfly-room-reverb/DSP.cpp b/plugins/dragonfly-room-reverb/DSP.cpp
index c99e0b6..8c39e5e 100644
--- a/plugins/dragonfly-room-reverb/DSP.cpp
+++ b/plugins/dragonfly-room-reverb/DSP.cpp
@@ -19,6 +19,7 @@
 #include "DistrhoPlugin.hpp"
 #include "DistrhoPluginInfo.h"
 #include "extra/ScopedDenormalDisable.hpp"
+#include "optimization/optimization.hpp"
 
 #include "DSP.hpp"
 
@@ -130,10 +131,11 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
       early_out_buffer[1],
       buffer_frames);
     
-    for (uint32_t i = 0; i < buffer_frames; i++) {
-      late_in_buffer[0][i] = early_send * early_out_buffer[0][i] + filtered_input_buffer[0][i];
-      late_in_buffer[1][i] = early_send * early_out_buffer[1][i] + filtered_input_buffer[1][i];
-    }
+    VMUL32FLOAT_V(early_send, early_out_buffer[0], late_buffer, buffer_frames);
+    VSUM32FLOAT(late_buffer, filtered_input_buffer[0], late_in_buffer[0], buffer_frames);
+
+    VMUL32FLOAT_V(early_send, early_out_buffer[1], late_buffer, buffer_frames);
+    VSUM32FLOAT(late_buffer, filtered_input_buffer[1], late_in_buffer[1], buffer_frames);
     
     late.processreplace(
       const_cast<float *>(late_in_buffer[0]),
@@ -142,23 +144,23 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
       late_out_buffer[1],
       buffer_frames);
 
-    for (uint32_t i = 0; i < buffer_frames; i++) {
-      outputs[0][offset + i] = dryLevel   * inputs[0][offset + i];
-      outputs[1][offset + i] = dryLevel   * inputs[1][offset + i];
-    }
+    VMUL32FLOAT_V(dryLevel, &inputs[0][offset], &outputs[0][offset], buffer_frames);
+    VMUL32FLOAT_V(dryLevel, &inputs[1][offset], &outputs[1][offset], buffer_frames);
     
     if( earlyLevel > 0.0 ){
-      for (uint32_t i = 0; i < buffer_frames; i++) {
-        outputs[0][offset + i] += earlyLevel * early_out_buffer[0][i];
-        outputs[1][offset + i] += earlyLevel * early_out_buffer[1][i];
-      }
+      VMUL32FLOAT_V(earlyLevel, early_out_buffer[0], early_buffer, buffer_frames);
+      VSUM32FLOAT(early_buffer, &outputs[0][offset], &outputs[0][offset], buffer_frames);
+
+      VMUL32FLOAT_V(earlyLevel, early_out_buffer[1], early_buffer, buffer_frames);
+      VSUM32FLOAT(early_buffer, &outputs[1][offset], &outputs[1][offset], buffer_frames);
     }
     
     if( lateLevel > 0.0 ){
-      for (uint32_t i = 0; i < buffer_frames; i++) {
-        outputs[0][offset + i] += lateLevel  * late_out_buffer[0][i];
-        outputs[1][offset + i] += lateLevel  * late_out_buffer[1][i];
-      }
+      VMUL32FLOAT_V(lateLevel, late_out_buffer[0], late_buffer, buffer_frames);
+      VSUM32FLOAT(late_buffer, &outputs[0][offset], &outputs[0][offset], buffer_frames);
+
+      VMUL32FLOAT_V(lateLevel, late_out_buffer[1], late_buffer, buffer_frames);
+      VSUM32FLOAT(late_buffer, &outputs[1][offset], &outputs[1][offset], buffer_frames);
     }
   }
 }
diff --git a/plugins/dragonfly-room-reverb/DSP.hpp b/plugins/dragonfly-room-reverb/DSP.hpp
index b1a7aff..92fe2e2 100644
--- a/plugins/dragonfly-room-reverb/DSP.hpp
+++ b/plugins/dragonfly-room-reverb/DSP.hpp
@@ -52,6 +52,9 @@ class DragonflyReverbDSP : public AbstractDSP {
   float late_in_buffer[2][BUFFER_SIZE];
   float late_out_buffer[2][BUFFER_SIZE];
 
+  float early_buffer[BUFFER_SIZE];
+  float late_buffer[BUFFER_SIZE];
+
   void setInputLPF(float freq);
   void setInputHPF(float freq);
 };
diff --git a/plugins/dragonfly-room-reverb/Makefile b/plugins/dragonfly-room-reverb/Makefile
index 0fd6537..24f0631 100644
--- a/plugins/dragonfly-room-reverb/Makefile
+++ b/plugins/dragonfly-room-reverb/Makefile
@@ -14,7 +14,10 @@ NAME = DragonflyRoomReverb
 
 FILES_COMMON  = DSP.cpp \
 	../../common/kiss_fft/kiss_fft.c \
-	../../common/kiss_fft/kiss_fftr.c
+	../../common/kiss_fft/kiss_fftr.c \
+	../../common/optimization/optimization.cpp \
+	../../common/optimization/default.cpp \
+	../../common/optimization/avx.cpp
 
 ifneq ($(SYSTEM_FREEVERB3),true)
 FILES_COMMON += \
@@ -59,7 +62,7 @@ include ../../dpf/Makefile.plugins.mk
 # --------------------------------------------------------------
 # Build dependencies
 
-BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT
+BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT -mavx
 ifeq ($(SYSTEM_FREEVERB3),true)
 BUILD_CXX_FLAGS += -DLIBSRATE1
 BUILD_CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags freeverb3-3)

From edea88ad1f659ebc74b11ac632b67f9a5591a1a7 Mon Sep 17 00:00:00 2001
From: Manuel Virgilio <real_virgil@yahoo.it>
Date: Wed, 15 Nov 2023 19:43:53 +0100
Subject: [PATCH 2/2] added macro to enable SIMD code, insertion of makefile to
 handle the optimization files inclusion

---
 common/optimization/Makefile.optimization.mk | 41 ++++++++++++
 common/optimization/avx.cpp                  | 68 +++++++++++++++++---
 common/optimization/avx.hpp                  |  4 +-
 common/optimization/default.cpp              | 10 ++-
 common/optimization/default.hpp              |  3 +-
 common/optimization/optimization.cpp         | 25 +++++--
 common/optimization/optimization.hpp         |  4 +-
 plugins/dragonfly-early-reflections/DSP.cpp  | 19 +++++-
 plugins/dragonfly-early-reflections/DSP.hpp  |  2 +
 plugins/dragonfly-early-reflections/Makefile | 12 ++--
 plugins/dragonfly-hall-reverb/DSP.cpp        | 55 +++++++++++-----
 plugins/dragonfly-hall-reverb/DSP.hpp        |  3 -
 plugins/dragonfly-hall-reverb/Makefile       | 12 ++--
 plugins/dragonfly-plate-reverb/DSP.cpp       | 17 ++++-
 plugins/dragonfly-plate-reverb/DSP.hpp       |  2 +
 plugins/dragonfly-plate-reverb/Makefile      | 12 ++--
 plugins/dragonfly-room-reverb/DSP.cpp        | 51 ++++++++++-----
 plugins/dragonfly-room-reverb/DSP.hpp        |  3 -
 plugins/dragonfly-room-reverb/Makefile       | 12 ++--
 19 files changed, 276 insertions(+), 79 deletions(-)
 create mode 100644 common/optimization/Makefile.optimization.mk

diff --git a/common/optimization/Makefile.optimization.mk b/common/optimization/Makefile.optimization.mk
new file mode 100644
index 0000000..99369f6
--- /dev/null
+++ b/common/optimization/Makefile.optimization.mk
@@ -0,0 +1,41 @@
+
+ifeq ($(OS),Windows_NT)
+    MACHINE = WIN32
+    ifeq ($(PROCESSOR_ARCHITECTURE),AMD64)
+    	ARCH = AMD64
+    endif
+    ifeq ($(PROCESSOR_ARCHITECTURE),x86)
+        ARCH = IA32
+    endif
+else
+    UNAME_M := $(shell uname -m)
+    ifeq ($(UNAME_M),x86_64)
+        ARCH = AMD64
+    endif
+    ifneq ($(filter %86,$(UNAME_M)),)
+        ARCH += IA32
+    endif
+    ifneq ($(filter arm%,$(UNAME_M)),)
+        ARCH += ARM
+    endif
+endif
+
+ifeq ($(ARCH), AMD64)
+USE_PLUGIN_SIMD = true
+CXXFLAGS += -mavx -mfma -DUSE_PLUGIN_SIMD
+endif
+
+# --------------------------
+# File inclusion
+ifeq ($(USE_PLUGIN_SIMD), true)
+FILES_COMMON  += \
+	../../common/optimization/optimization.cpp \
+	../../common/optimization/default.cpp
+
+ifeq ($(ARCH), AMD64)
+FILES_COMMON  += \
+	../../common/optimization/avx.cpp
+endif
+
+endif
+
diff --git a/common/optimization/avx.cpp b/common/optimization/avx.cpp
index 539097d..7d7480c 100644
--- a/common/optimization/avx.cpp
+++ b/common/optimization/avx.cpp
@@ -2,21 +2,38 @@
 
 #include <immintrin.h>
 
-void VSUM32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length)
+static int8_t AVXInitialized = 0;   // nonzero once CheckFMASupport() has probed the CPU
+static int8_t FMASupported = 0;     // nonzero when the CPU reports the "fma" feature
+
+// Probes the CPU for FMA on the first call and caches the answer in FMASupported.
+// File-local on purpose; NOTE(review): the flags are unsynchronized - confirm single-threaded first use.
+static void CheckFMASupport()
+{
+    if ( AVXInitialized )
+    {
+        return;
+    }
+
+    __builtin_cpu_init();
+    FMASupported = __builtin_cpu_supports("fma") ? 1 : 0;
+    AVXInitialized = 1;
+}
+
+void VADD32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length)
 {
     float extracted[8];
     int32_t processed = 0;
     
     while ( processed < length )
     {
-        int processing = ((length - processed) % 8) + 1;
+        int32_t processing = ((length - processed - 1) % 8) + 1; // 1..8 and <= remaining: remainder first, then full chunks
 
         const __m256 mm_op1 = _mm256_loadu_ps(op1 + processed);
         const __m256 mm_op2 = _mm256_loadu_ps(op2 + processed );
         __m256 mm_res = _mm256_add_ps(mm_op1, mm_op2);
         
         _mm256_storeu_ps(extracted, mm_res);
-        for ( int i = 0 ; i < processing ; i++ )
+        for ( int32_t i = 0 ; i < processing ; i++ )
         {
             result[processed + i] = extracted[i];
         }
@@ -32,13 +49,13 @@ void VMUL32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t
  
     while ( processed < length )
     {
-        int processing = ((length - processed) % 8) + 1;
+        int32_t processing = ((length - processed - 1) % 8) + 1; // 1..8 and <= remaining: remainder first, then full chunks
         const __m256 mm_op1 = _mm256_loadu_ps(op1 + processed);
         const __m256 mm_op2 = _mm256_loadu_ps(op2 + processed);
         __m256 mm_res = _mm256_mul_ps(mm_op1, mm_op2);
         
         _mm256_storeu_ps(extracted, mm_res);
-        for ( int i = 0 ; i < processing ; i++ )
+        for ( int32_t i = 0 ; i < processing ; i++ )
         {
             result[processed + i] = extracted[i];
         }
@@ -52,16 +69,16 @@ void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32
     float extracted[8];
     const __m256 mm_op1 = _mm256_set1_ps(value);
         
-    int processed = 0;
+    int32_t processed = 0;
     while ( processed < length )
     {
-        int processing = ((length - processed) % 8) + 1;
+        int32_t processing = ((length - processed - 1) % 8) + 1; // 1..8 and <= remaining: remainder first, then full chunks
 
         const __m256 mm_op2 = _mm256_loadu_ps(vec + processed);
         __m256 mm_res = _mm256_mul_ps(mm_op1, mm_op2);
       
         _mm256_storeu_ps(extracted, mm_res);
-        for ( int i = 0 ; i < processing ; i++ )
+        for ( int32_t i = 0 ; i < processing ; i++ )
         {
             result[processed + i] = extracted[i];
         }
@@ -70,3 +87,38 @@ void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32
     }
 }
 
+// Computes result[i] = value * vmul[i] + vadd[i] for every i in [0, length).
+// Each chunk is loaded in full before it is stored, so result may be the very same buffer as an input.
+void VMADD32FLOAT_V_avx(const float value, const float* vmul, const float* vadd, float* result, int32_t length)
+{
+    const __m256 mm_val = _mm256_set1_ps(value);
+
+    CheckFMASupport();
+
+    // Only complete 8-lane chunks go through AVX, stored straight into result.
+    int32_t processed = 0;
+    for ( ; length - processed >= 8 ; processed += 8 )
+    {
+        const __m256 mm_mul = _mm256_loadu_ps(vmul + processed);
+        const __m256 mm_add = _mm256_loadu_ps(vadd + processed);
+
+        __m256 mm_res;
+        if ( FMASupported )
+        {
+            mm_res = _mm256_fmadd_ps(mm_val, mm_mul, mm_add);
+        }
+        else
+        {
+            mm_res = _mm256_mul_ps(mm_val, mm_mul);
+            mm_res = _mm256_add_ps(mm_res, mm_add);
+        }
+
+        _mm256_storeu_ps(result + processed, mm_res);
+    }
+
+    // Scalar tail for the last 0-7 elements, so nothing is read or written past length.
+    for ( ; processed < length ; processed++ )
+    {
+        result[processed] = (value * vmul[processed]) + vadd[processed];
+    }
+}
diff --git a/common/optimization/avx.hpp b/common/optimization/avx.hpp
index 6b0d1e3..6716f4b 100644
--- a/common/optimization/avx.hpp
+++ b/common/optimization/avx.hpp
@@ -4,10 +4,12 @@
 
 #include <stdint.h>
 
-void VSUM32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length);
+void VADD32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length);
 
 void VMUL32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length);
 
 void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32_t length);
 
+void VMADD32FLOAT_V_avx(const float value, const float* vmul, const float* vadd, float* result, int32_t length);
+
 #endif
\ No newline at end of file
diff --git a/common/optimization/default.cpp b/common/optimization/default.cpp
index b12033f..dc946a2 100644
--- a/common/optimization/default.cpp
+++ b/common/optimization/default.cpp
@@ -1,6 +1,6 @@
 #include "default.hpp"
 
-void VSUM32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length)
+void VADD32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length)
 {
     for ( int32_t i = 0 ; i < length ; i++ )
     {
@@ -24,3 +24,11 @@ void VMUL32FLOAT_V_default(const float value, const float* vec, float* result, i
     }
 }
 
+void VMADD32FLOAT_V_default(const float value, const float* vmul, const float* vadd, float* result, int32_t length)
+{
+    // Scalar fallback, no SIMD: result[k] = value * vmul[k] + vadd[k] for k in [0, length).
+    for ( int32_t k = 0 ; k < length ; ++k ) {
+        result[k] = (value * vmul[k]) + vadd[k];
+    }
+}
+
diff --git a/common/optimization/default.hpp b/common/optimization/default.hpp
index ebdd7a8..a351371 100644
--- a/common/optimization/default.hpp
+++ b/common/optimization/default.hpp
@@ -4,11 +4,12 @@
 
 #include <stdint.h>
 
-void VSUM32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length);
+void VADD32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length);
 
 void VMUL32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length);
 
 void VMUL32FLOAT_V_default(const float value, const float* vec, float* result, int32_t length);
 
+void VMADD32FLOAT_V_default(const float value, const float* vmul, const float* vadd, float* result, int32_t length);
 
 #endif
\ No newline at end of file
diff --git a/common/optimization/optimization.cpp b/common/optimization/optimization.cpp
index b6066f9..a6f8bd4 100644
--- a/common/optimization/optimization.cpp
+++ b/common/optimization/optimization.cpp
@@ -7,9 +7,10 @@
 
 typedef struct 
 {
-    void (*VSUM32FLOAT)(const float* op1, const float* op2, float* result, int32_t length);    
+    void (*VADD32FLOAT)(const float* op1, const float* op2, float* result, int32_t length);    // assigned VADD32FLOAT_avx or VADD32FLOAT_default by SetupOptimization()
     void (*VMUL32FLOAT)(const float* op1, const float* op2, float* result, int32_t length);    
     void (*VMUL32FLOAT_V)(const float value, const float* vec, float* result, int32_t length);
+    void (*VMADD32FLOAT_V)(const float value, const float* vmul, const float* vadd, float* result, int32_t length); // assigned VMADD32FLOAT_V_avx or VMADD32FLOAT_V_default by SetupOptimization()
 } _CpuOptimization;
 
 int8_t OptimizationInitialized = 0;
@@ -18,30 +19,34 @@ _CpuOptimization CpuOptimization;
 void SetupOptimization()
 {
+#if defined(__x86_64__) // __builtin_cpu_init/__builtin_cpu_supports are target-specific builtins: keep both inside the x86-64 guard so other targets still compile and take the default branch
     __builtin_cpu_init();
     if ( __builtin_cpu_supports("avx") )
     {
-        CpuOptimization.VSUM32FLOAT = VSUM32FLOAT_avx;
+        CpuOptimization.VADD32FLOAT = VADD32FLOAT_avx;
         CpuOptimization.VMUL32FLOAT = VMUL32FLOAT_avx;
         CpuOptimization.VMUL32FLOAT_V = VMUL32FLOAT_V_avx;
+        CpuOptimization.VMADD32FLOAT_V = VMADD32FLOAT_V_avx;
     }
     else
+#endif
     {
-        CpuOptimization.VSUM32FLOAT = VSUM32FLOAT_default;
+        CpuOptimization.VADD32FLOAT = VADD32FLOAT_default;
         CpuOptimization.VMUL32FLOAT = VMUL32FLOAT_default;
         CpuOptimization.VMUL32FLOAT_V = VMUL32FLOAT_V_default;
+        CpuOptimization.VMADD32FLOAT_V = VMADD32FLOAT_V_default;
     }
     OptimizationInitialized = 1;
 }
 
 
-void VSUM32FLOAT(const float* op1, const float* op2, float* result, int32_t length)
+void VADD32FLOAT(const float* op1, const float* op2, float* result, int32_t length)
 {
     if (!OptimizationInitialized)
     {
         SetupOptimization();
     }
 
-    CpuOptimization.VSUM32FLOAT(op1,op2,result,length);
+    CpuOptimization.VADD32FLOAT(op1,op2,result,length);
 }
 
 void VMUL32FLOAT(const float* op1, const float* op2, float* result, int32_t length)
@@ -62,4 +67,17 @@ void VMUL32FLOAT_V(const float value, const float* vec, float* result, int32_t l
     }
 
     CpuOptimization.VMUL32FLOAT_V(value,vec,result,length);
+}
+
+// Public multiply-add entry point. Runs SetupOptimization() on first use if
+// the dispatch table has not been initialized yet, then forwards all arguments
+// unchanged to the VMADD32FLOAT_V implementation stored in CpuOptimization.
+void VMADD32FLOAT_V(const float value, const float* vmul, const float* vadd, float* result, int32_t length)
+{
+    if (OptimizationInitialized == 0)
+    {
+        SetupOptimization();
+    }
+
+    (*CpuOptimization.VMADD32FLOAT_V)(value, vmul, vadd, result, length);
 }
\ No newline at end of file
diff --git a/common/optimization/optimization.hpp b/common/optimization/optimization.hpp
index a3f1a89..47e4d99 100644
--- a/common/optimization/optimization.hpp
+++ b/common/optimization/optimization.hpp
@@ -4,10 +4,12 @@
 
 #include <stdint.h>
 
-void VSUM32FLOAT(const float* op1, const float* op2, float* result, int32_t length);
+void VADD32FLOAT(const float* op1, const float* op2, float* result, int32_t length);
 
 void VMUL32FLOAT(const float* op1, const float* op2, float* result, int32_t length);
 
 void VMUL32FLOAT_V(const float value, const float* vec, float* result, int32_t length);
 
+void VMADD32FLOAT_V(const float value, const float* vmul, const float* vadd, float* result, int32_t length); // forwards to the multiply-add implementation selected by SetupOptimization()
+
 #endif
\ No newline at end of file
diff --git a/plugins/dragonfly-early-reflections/DSP.cpp b/plugins/dragonfly-early-reflections/DSP.cpp
index 858caa4..3288bb6 100644
--- a/plugins/dragonfly-early-reflections/DSP.cpp
+++ b/plugins/dragonfly-early-reflections/DSP.cpp
@@ -19,7 +19,10 @@
 #include "DistrhoPlugin.hpp"
 #include "DistrhoPluginInfo.h"
 #include "extra/ScopedDenormalDisable.hpp"
+
+#ifdef USE_PLUGIN_SIMD
 #include "optimization/optimization.hpp"
+#endif
 
 #include "DSP.hpp"
 
@@ -92,13 +95,25 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
         buffer_frames
     );
 
+#ifdef USE_PLUGIN_SIMD
     VMUL32FLOAT_V( dryLevel, &inputs[0][offset], dry_buffer, buffer_frames );
     VMUL32FLOAT_V( wetLevel, output_buffer[0], wet_buffer, buffer_frames );
-    VSUM32FLOAT(dry_buffer, wet_buffer, &outputs[0][offset], buffer_frames );
+    VADD32FLOAT(dry_buffer, wet_buffer, &outputs[0][offset], buffer_frames );
 
     VMUL32FLOAT_V( dryLevel, &inputs[1][offset], dry_buffer, buffer_frames );
     VMUL32FLOAT_V( wetLevel, output_buffer[1], wet_buffer, buffer_frames );
-    VSUM32FLOAT(dry_buffer, wet_buffer, &outputs[1][offset], buffer_frames );
+    VADD32FLOAT(dry_buffer, wet_buffer, &outputs[1][offset], buffer_frames );
+#else
+    for (uint32_t i = 0; i < buffer_frames; i++) {
+      outputs[0][offset + i] =
+        dryLevel * inputs[0][offset + i]  +
+        wetLevel * output_buffer[0][i];
+
+      outputs[1][offset + i] =
+        dryLevel * inputs[1][offset + i]  +
+        wetLevel * output_buffer[1][i];
+    }
+#endif
   }
 }
 
diff --git a/plugins/dragonfly-early-reflections/DSP.hpp b/plugins/dragonfly-early-reflections/DSP.hpp
index 447cf72..99a25c3 100644
--- a/plugins/dragonfly-early-reflections/DSP.hpp
+++ b/plugins/dragonfly-early-reflections/DSP.hpp
@@ -46,8 +46,10 @@ class DragonflyReverbDSP : public AbstractDSP {
   float input_buffer[2][BUFFER_SIZE];
   float output_buffer[2][BUFFER_SIZE];
 
+#ifdef USE_PLUGIN_SIMD
   float dry_buffer[BUFFER_SIZE];
   float wet_buffer[BUFFER_SIZE];
+#endif
 
   void setInputLPF(float freq);
   void setInputHPF(float freq);
diff --git a/plugins/dragonfly-early-reflections/Makefile b/plugins/dragonfly-early-reflections/Makefile
index f0c1af3..df78bc6 100644
--- a/plugins/dragonfly-early-reflections/Makefile
+++ b/plugins/dragonfly-early-reflections/Makefile
@@ -14,10 +14,7 @@ NAME = DragonflyEarlyReflections
 
 FILES_COMMON  = DSP.cpp \
 	../../common/kiss_fft/kiss_fft.c \
-	../../common/kiss_fft/kiss_fftr.c \
-	../../common/optimization/optimization.cpp \
-	../../common/optimization/default.cpp \
-	../../common/optimization/avx.cpp
+	../../common/kiss_fft/kiss_fftr.c 
 
 ifneq ($(SYSTEM_FREEVERB3),true)
 FILES_COMMON += \
@@ -51,6 +48,11 @@ FILES_UI  = $(FILES_COMMON) \
 	../../common/Selection.cpp \
 	../../common/Bitstream_Vera_Sans_Regular.cpp
 
+# --------------------------------------------------------------
+# Check for optimization support
+
+include ../../common/optimization/Makefile.optimization.mk
+
 # --------------------------------------------------------------
 # Do some magic
 
@@ -61,7 +63,7 @@ include ../../dpf/Makefile.plugins.mk
 # --------------------------------------------------------------
 # Build dependencies
 
-BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT -mavx
+BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT
 ifeq ($(SYSTEM_FREEVERB3),true)
 BUILD_CXX_FLAGS += -DLIBSRATE1
 BUILD_CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags freeverb3-3)
diff --git a/plugins/dragonfly-hall-reverb/DSP.cpp b/plugins/dragonfly-hall-reverb/DSP.cpp
index dcad8c1..1f12522 100644
--- a/plugins/dragonfly-hall-reverb/DSP.cpp
+++ b/plugins/dragonfly-hall-reverb/DSP.cpp
@@ -18,7 +18,9 @@
 #include "DistrhoPlugin.hpp"
 #include "DistrhoPluginInfo.h"
 #include "extra/ScopedDenormalDisable.hpp"
+#ifdef USE_PLUGIN_SIMD
 #include "optimization/optimization.hpp"
+#endif
 
 #include "DSP.hpp"
 
@@ -116,12 +118,16 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
       early_out_buffer[0],
       early_out_buffer[1],
       buffer_frames);
-      
-    VMUL32FLOAT_V(early_send, early_out_buffer[0], early_buffer, buffer_frames);
-    VSUM32FLOAT(early_buffer, &inputs[0][offset], late_in_buffer[0], buffer_frames );
-
-    VMUL32FLOAT_V(early_send, early_out_buffer[1], early_buffer, buffer_frames);
-    VSUM32FLOAT(early_buffer, &inputs[1][offset], late_in_buffer[1], buffer_frames );
+    
+#ifdef USE_PLUGIN_SIMD
+    VMADD32FLOAT_V(early_send, early_out_buffer[0], &inputs[0][offset], late_in_buffer[0], buffer_frames );
+    VMADD32FLOAT_V(early_send, early_out_buffer[1], &inputs[1][offset], late_in_buffer[1], buffer_frames );
+#else
+    for (uint32_t i = 0; i < buffer_frames; i++) {
+      late_in_buffer[0][i] = early_send * early_out_buffer[0][i] + inputs[0][offset + i];
+      late_in_buffer[1][i] = early_send * early_out_buffer[1][i] + inputs[1][offset + i];
+    }
+#endif
     
     late.processreplace(
       const_cast<float *>(late_in_buffer[0]),
@@ -129,24 +135,39 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
       late_out_buffer[0],
       late_out_buffer[1],
       buffer_frames);
-      
+
+#ifdef USE_PLUGIN_SIMD      
     VMUL32FLOAT_V(dryLevel, &inputs[0][offset], &outputs[0][offset], buffer_frames);
     VMUL32FLOAT_V(dryLevel, &inputs[1][offset], &outputs[1][offset], buffer_frames);
+#else
+    for (uint32_t i = 0; i < buffer_frames; i++) {
+      outputs[0][offset + i] = dryLevel   * inputs[0][offset + i];
+      outputs[1][offset + i] = dryLevel   * inputs[1][offset + i];
+    }
+#endif
 
     if( earlyLevel > 0.0 ){
-      VMUL32FLOAT_V(earlyLevel, early_out_buffer[0], early_buffer, buffer_frames);
-      VSUM32FLOAT(&outputs[0][offset], early_buffer, &outputs[0][offset], buffer_frames);
-
-      VMUL32FLOAT_V(earlyLevel, early_out_buffer[1], early_buffer, buffer_frames);
-      VSUM32FLOAT(&outputs[1][offset], early_buffer, &outputs[1][offset], buffer_frames);
+#ifdef USE_PLUGIN_SIMD      
+      VMADD32FLOAT_V(earlyLevel, early_out_buffer[0], &outputs[0][offset], &outputs[0][offset], buffer_frames );
+      VMADD32FLOAT_V(earlyLevel, early_out_buffer[1], &outputs[1][offset], &outputs[1][offset], buffer_frames );
+#else
+      for (uint32_t i = 0; i < buffer_frames; i++) {
+        outputs[0][offset + i] += earlyLevel * early_out_buffer[0][i];
+        outputs[1][offset + i] += earlyLevel * early_out_buffer[1][i];
+      }
+#endif
     }
     
     if( lateLevel > 0.0 ){
-      VMUL32FLOAT_V(lateLevel, late_out_buffer[0], late_buffer, buffer_frames);
-      VSUM32FLOAT(&outputs[0][offset], late_buffer, &outputs[0][offset], buffer_frames);
-
-      VMUL32FLOAT_V(lateLevel, late_out_buffer[1], late_buffer, buffer_frames);
-      VSUM32FLOAT(&outputs[1][offset], late_buffer, &outputs[1][offset], buffer_frames);
+#ifdef USE_PLUGIN_SIMD
+      VMADD32FLOAT_V(lateLevel, late_out_buffer[0], &outputs[0][offset], &outputs[0][offset], buffer_frames );
+      VMADD32FLOAT_V(lateLevel, late_out_buffer[1], &outputs[1][offset], &outputs[1][offset], buffer_frames );
+#else
+      for (uint32_t i = 0; i < buffer_frames; i++) {
+        outputs[0][offset + i] += lateLevel  * late_out_buffer[0][i];
+        outputs[1][offset + i] += lateLevel  * late_out_buffer[1][i];
+      }
+#endif
     }
   }
 }
diff --git a/plugins/dragonfly-hall-reverb/DSP.hpp b/plugins/dragonfly-hall-reverb/DSP.hpp
index 72102ab..1271c6e 100644
--- a/plugins/dragonfly-hall-reverb/DSP.hpp
+++ b/plugins/dragonfly-hall-reverb/DSP.hpp
@@ -47,9 +47,6 @@ class DragonflyReverbDSP : public AbstractDSP {
   float early_out_buffer[2][BUFFER_SIZE];
   float late_in_buffer[2][BUFFER_SIZE];
   float late_out_buffer[2][BUFFER_SIZE];
-
-  float early_buffer[BUFFER_SIZE];
-  float late_buffer[BUFFER_SIZE];
 };
 
 #endif
diff --git a/plugins/dragonfly-hall-reverb/Makefile b/plugins/dragonfly-hall-reverb/Makefile
index 9170538..c08ebcf 100644
--- a/plugins/dragonfly-hall-reverb/Makefile
+++ b/plugins/dragonfly-hall-reverb/Makefile
@@ -14,10 +14,7 @@ NAME = DragonflyHallReverb
 
 FILES_COMMON  = DSP.cpp \
 	../../common/kiss_fft/kiss_fft.c \
-	../../common/kiss_fft/kiss_fftr.c \
-	../../common/optimization/optimization.cpp \
-	../../common/optimization/default.cpp \
-	../../common/optimization/avx.cpp
+	../../common/kiss_fft/kiss_fftr.c
 
 ifneq ($(SYSTEM_FREEVERB3),true)
 FILES_COMMON += \
@@ -52,6 +49,11 @@ FILES_UI  = $(FILES_COMMON) \
 	../../common/Spectrogram.cpp \
 	../../common/Bitstream_Vera_Sans_Regular.cpp
 
+# --------------------------------------------------------------
+# Check for optimization support
+
+include ../../common/optimization/Makefile.optimization.mk
+
 # --------------------------------------------------------------
 # Do some magic
 
@@ -62,7 +64,7 @@ include ../../dpf/Makefile.plugins.mk
 # --------------------------------------------------------------
 # Build dependencies
 
-BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT -mavx
+BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT
 ifeq ($(SYSTEM_FREEVERB3),true)
 BUILD_CXX_FLAGS += -DLIBSRATE1
 BUILD_CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags freeverb3-3)
diff --git a/plugins/dragonfly-plate-reverb/DSP.cpp b/plugins/dragonfly-plate-reverb/DSP.cpp
index 3a32418..b44d0cc 100644
--- a/plugins/dragonfly-plate-reverb/DSP.cpp
+++ b/plugins/dragonfly-plate-reverb/DSP.cpp
@@ -19,7 +19,9 @@
 #include "DistrhoPlugin.hpp"
 #include "DistrhoPluginInfo.h"
 #include "extra/ScopedDenormalDisable.hpp"
+#ifdef USE_PLUGIN_SIMD
 #include "optimization/optimization.hpp"
+#endif
 
 #include "DSP.hpp"
 
@@ -246,13 +248,24 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
         buffer_frames
     );
 
+#ifdef USE_PLUGIN_SIMD
     VMUL32FLOAT_V( dryLevel, &inputs[0][offset], dry_buffer, buffer_frames);
     VMUL32FLOAT_V( wetLevel, output_buffer[0], wet_buffer, buffer_frames);
-    VSUM32FLOAT(dry_buffer, wet_buffer, &outputs[0][offset], buffer_frames);
+    VADD32FLOAT(dry_buffer, wet_buffer, &outputs[0][offset], buffer_frames);
 
     VMUL32FLOAT_V( dryLevel, &inputs[1][offset], dry_buffer, buffer_frames);
     VMUL32FLOAT_V( wetLevel, output_buffer[1], wet_buffer, buffer_frames);
-    VSUM32FLOAT(dry_buffer, wet_buffer, &outputs[1][offset], buffer_frames);
+    VADD32FLOAT(dry_buffer, wet_buffer, &outputs[1][offset], buffer_frames);
+#else
+    for (uint32_t i = 0; i < buffer_frames; i++) {
+      outputs[0][offset + i] =
+        dryLevel   * inputs[0][offset + i]  +
+        wetLevel   * output_buffer[0][i];
+      outputs[1][offset + i] =
+        dryLevel   * inputs[1][offset + i]  +
+        wetLevel   * output_buffer[1][i];
+    }
+#endif
   }
 }
 
diff --git a/plugins/dragonfly-plate-reverb/DSP.hpp b/plugins/dragonfly-plate-reverb/DSP.hpp
index da40d82..07206b2 100644
--- a/plugins/dragonfly-plate-reverb/DSP.hpp
+++ b/plugins/dragonfly-plate-reverb/DSP.hpp
@@ -79,8 +79,10 @@ class DragonflyReverbDSP : public AbstractDSP {
   float filtered_input_buffer[2][BUFFER_SIZE];
   float output_buffer[2][BUFFER_SIZE];
 
+#ifdef USE_PLUGIN_SIMD
   float dry_buffer[BUFFER_SIZE];
   float wet_buffer[BUFFER_SIZE];
+#endif
 
   void setInputLPF(float freq);
   void setInputHPF(float freq);
diff --git a/plugins/dragonfly-plate-reverb/Makefile b/plugins/dragonfly-plate-reverb/Makefile
index bddfe0e..2dfbb7f 100644
--- a/plugins/dragonfly-plate-reverb/Makefile
+++ b/plugins/dragonfly-plate-reverb/Makefile
@@ -14,10 +14,7 @@ NAME = DragonflyPlateReverb
 
 FILES_COMMON  = DSP.cpp \
 	../../common/kiss_fft/kiss_fft.c \
-	../../common/kiss_fft/kiss_fftr.c \
-	../../common/optimization/optimization.cpp \
-	../../common/optimization/default.cpp \
-	../../common/optimization/avx.cpp
+	../../common/kiss_fft/kiss_fftr.c 
 
 ifneq ($(SYSTEM_FREEVERB3),true)
 FILES_COMMON += \
@@ -52,6 +49,11 @@ FILES_UI  = $(FILES_COMMON) \
 	../../common/Spectrogram.cpp \
 	../../common/Bitstream_Vera_Sans_Regular.cpp
 
+# --------------------------------------------------------------
+# Check for optimization support
+
+include ../../common/optimization/Makefile.optimization.mk
+
 # --------------------------------------------------------------
 # Do some magic
 
@@ -62,7 +64,7 @@ include ../../dpf/Makefile.plugins.mk
 # --------------------------------------------------------------
 # Build dependencies
 
-BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT -mavx
+BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT
 ifeq ($(SYSTEM_FREEVERB3),true)
 BUILD_CXX_FLAGS += -DLIBSRATE1
 BUILD_CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags freeverb3-3)
diff --git a/plugins/dragonfly-room-reverb/DSP.cpp b/plugins/dragonfly-room-reverb/DSP.cpp
index 8c39e5e..8cbf6fc 100644
--- a/plugins/dragonfly-room-reverb/DSP.cpp
+++ b/plugins/dragonfly-room-reverb/DSP.cpp
@@ -19,7 +19,9 @@
 #include "DistrhoPlugin.hpp"
 #include "DistrhoPluginInfo.h"
 #include "extra/ScopedDenormalDisable.hpp"
+#ifdef USE_PLUGIN_SIMD
 #include "optimization/optimization.hpp"
+#endif
 
 #include "DSP.hpp"
 
@@ -131,11 +133,15 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
       early_out_buffer[1],
       buffer_frames);
     
-    VMUL32FLOAT_V(early_send, early_out_buffer[0], late_buffer, buffer_frames);
-    VSUM32FLOAT(late_buffer, filtered_input_buffer[0], late_in_buffer[0], buffer_frames);
-
-    VMUL32FLOAT_V(early_send, early_out_buffer[1], late_buffer, buffer_frames);
-    VSUM32FLOAT(late_buffer, filtered_input_buffer[1], late_in_buffer[1], buffer_frames);
+#ifdef USE_PLUGIN_SIMD
+    VMADD32FLOAT_V(early_send, early_out_buffer[0], filtered_input_buffer[0], late_in_buffer[0], buffer_frames );
+    VMADD32FLOAT_V(early_send, early_out_buffer[1], filtered_input_buffer[1], late_in_buffer[1], buffer_frames );
+#else
+    for (uint32_t i = 0; i < buffer_frames; i++) {
+      late_in_buffer[0][i] = early_send * early_out_buffer[0][i] + filtered_input_buffer[0][i];
+      late_in_buffer[1][i] = early_send * early_out_buffer[1][i] + filtered_input_buffer[1][i];
+    }
+#endif
     
     late.processreplace(
       const_cast<float *>(late_in_buffer[0]),
@@ -144,23 +150,38 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
       late_out_buffer[1],
       buffer_frames);
 
+#ifdef USE_PLUGIN_SIMD
     VMUL32FLOAT_V(dryLevel, &inputs[0][offset], &outputs[0][offset], buffer_frames);
     VMUL32FLOAT_V(dryLevel, &inputs[1][offset], &outputs[1][offset], buffer_frames);
+#else
+    for (uint32_t i = 0; i < buffer_frames; i++) {
+      outputs[0][offset + i] = dryLevel   * inputs[0][offset + i];
+      outputs[1][offset + i] = dryLevel   * inputs[1][offset + i];
+    }
+#endif
     
     if( earlyLevel > 0.0 ){
-      VMUL32FLOAT_V(earlyLevel, early_out_buffer[0], early_buffer, buffer_frames);
-      VSUM32FLOAT(early_buffer, &outputs[0][offset], &outputs[0][offset], buffer_frames);
-
-      VMUL32FLOAT_V(earlyLevel, early_out_buffer[1], early_buffer, buffer_frames);
-      VSUM32FLOAT(early_buffer, &outputs[1][offset], &outputs[1][offset], buffer_frames);
+#ifdef USE_PLUGIN_SIMD
+      VMADD32FLOAT_V(earlyLevel, early_out_buffer[0], &outputs[0][offset], &outputs[0][offset], buffer_frames );
+      VMADD32FLOAT_V(earlyLevel, early_out_buffer[1], &outputs[1][offset], &outputs[1][offset], buffer_frames );
+#else
+      for (uint32_t i = 0; i < buffer_frames; i++) {
+        outputs[0][offset + i] += earlyLevel * early_out_buffer[0][i];
+        outputs[1][offset + i] += earlyLevel * early_out_buffer[1][i];
+      }
+#endif
     }
     
     if( lateLevel > 0.0 ){
-      VMUL32FLOAT_V(lateLevel, late_out_buffer[0], late_buffer, buffer_frames);
-      VSUM32FLOAT(late_buffer, &outputs[0][offset], &outputs[0][offset], buffer_frames);
-
-      VMUL32FLOAT_V(lateLevel, late_out_buffer[1], late_buffer, buffer_frames);
-      VSUM32FLOAT(late_buffer, &outputs[1][offset], &outputs[1][offset], buffer_frames);
+#ifdef USE_PLUGIN_SIMD      
+      VMADD32FLOAT_V(lateLevel, late_out_buffer[0], &outputs[0][offset], &outputs[0][offset], buffer_frames );
+      VMADD32FLOAT_V(lateLevel, late_out_buffer[1], &outputs[1][offset], &outputs[1][offset], buffer_frames );
+#else
+      for (uint32_t i = 0; i < buffer_frames; i++) {
+        outputs[0][offset + i] += lateLevel  * late_out_buffer[0][i];
+        outputs[1][offset + i] += lateLevel  * late_out_buffer[1][i];
+      }
+#endif
     }
   }
 }
diff --git a/plugins/dragonfly-room-reverb/DSP.hpp b/plugins/dragonfly-room-reverb/DSP.hpp
index 92fe2e2..b1a7aff 100644
--- a/plugins/dragonfly-room-reverb/DSP.hpp
+++ b/plugins/dragonfly-room-reverb/DSP.hpp
@@ -52,9 +52,6 @@ class DragonflyReverbDSP : public AbstractDSP {
   float late_in_buffer[2][BUFFER_SIZE];
   float late_out_buffer[2][BUFFER_SIZE];
 
-  float early_buffer[BUFFER_SIZE];
-  float late_buffer[BUFFER_SIZE];
-
   void setInputLPF(float freq);
   void setInputHPF(float freq);
 };
diff --git a/plugins/dragonfly-room-reverb/Makefile b/plugins/dragonfly-room-reverb/Makefile
index 24f0631..ff0be81 100644
--- a/plugins/dragonfly-room-reverb/Makefile
+++ b/plugins/dragonfly-room-reverb/Makefile
@@ -14,10 +14,7 @@ NAME = DragonflyRoomReverb
 
 FILES_COMMON  = DSP.cpp \
 	../../common/kiss_fft/kiss_fft.c \
-	../../common/kiss_fft/kiss_fftr.c \
-	../../common/optimization/optimization.cpp \
-	../../common/optimization/default.cpp \
-	../../common/optimization/avx.cpp
+	../../common/kiss_fft/kiss_fftr.c 
 
 ifneq ($(SYSTEM_FREEVERB3),true)
 FILES_COMMON += \
@@ -52,6 +49,11 @@ FILES_UI  = $(FILES_COMMON) \
 	../../common/Spectrogram.cpp \
 	../../common/Bitstream_Vera_Sans_Regular.cpp
 
+# --------------------------------------------------------------
+# Check for optimization support
+
+include ../../common/optimization/Makefile.optimization.mk
+
 # --------------------------------------------------------------
 # Do some magic
 
@@ -62,7 +64,7 @@ include ../../dpf/Makefile.plugins.mk
 # --------------------------------------------------------------
 # Build dependencies
 
-BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT -mavx
+BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT
 ifeq ($(SYSTEM_FREEVERB3),true)
 BUILD_CXX_FLAGS += -DLIBSRATE1
 BUILD_CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags freeverb3-3)