michaelwillis · manuvi · Nov 14, 2023 · Nov 15, 2023
diff --git a/Makefile b/Makefile
@@ -48,6 +48,7 @@ clean:
 	rm -f common/*.d common/*.o
 	rm -f common/freeverb/*.d common/freeverb/*.o
 	rm -f common/kiss_fft/*.d common/kiss_fft/*.o
+	rm -f common/optimization/*.d common/optimization/*.o
 	rm -f dpf/utils/lv2_ttl_generator.d
 
 # --------------------------------------------------------------

diff --git a/common/optimization/Makefile.optimization.mk b/common/optimization/Makefile.optimization.mk
@@ -0,0 +1,41 @@
+
+# Derive ARCH from the build host (PROCESSOR_ARCHITECTURE on Windows, `uname -m` elsewhere); this is the host, not the compiler target.
+ifeq ($(OS),Windows_NT)
+    MACHINE = WIN32
+    ifeq ($(PROCESSOR_ARCHITECTURE),AMD64)
+    	ARCH = AMD64
+    endif
+    ifeq ($(PROCESSOR_ARCHITECTURE),x86)
+        ARCH = IA32
+    endif
+else
+    UNAME_M := $(shell uname -m)
+    ifeq ($(UNAME_M),x86_64)
+        ARCH = AMD64
+    endif
+    ifneq ($(filter %86,$(UNAME_M)),)
+        ARCH += IA32
+    endif
+    ifneq ($(filter arm%,$(UNAME_M)),)
+        ARCH += ARM
+    endif
+endif
+# On AMD64 enable the SIMD path. NOTE(review): -mavx/-mfma are appended to the shared CXXFLAGS; confirm they do not reach code that must run on non-AVX CPUs.
+ifeq ($(ARCH), AMD64)
+USE_PLUGIN_SIMD = true
+CXXFLAGS += -mavx -mfma -DUSE_PLUGIN_SIMD
+endif
+# --------------------------
+# File inclusion: with SIMD enabled, append the dispatcher and the portable kernels to FILES_COMMON.
+ifeq ($(USE_PLUGIN_SIMD), true)
+FILES_COMMON  += \
+	../../common/optimization/optimization.cpp \
+	../../common/optimization/default.cpp
+# The AVX kernels are appended only when ARCH is AMD64.
+ifeq ($(ARCH), AMD64)
+FILES_COMMON  += \
+	../../common/optimization/avx.cpp
+endif
+
+endif
+
diff --git a/common/optimization/avx.cpp b/common/optimization/avx.cpp
@@ -0,0 +1,124 @@
+#include "avx.hpp"
+
+#include <immintrin.h>
+
+int8_t AVXInitialized = 0;  // set to 1 once CheckFMASupport() has run its CPU probe
+int8_t FMASupported = 0;    // set to 1 when that probe reports the "fma" CPU feature
+
+// Detects FMA support once and caches it: FMASupported becomes 1 when the CPU
+// reports "fma"; AVXInitialized records that the probe has already run.
+void CheckFMASupport()
+{
+    if ( AVXInitialized )
+    {
+        return;
+    }
+
+    __builtin_cpu_init();
+    if ( __builtin_cpu_supports("fma") != 0 ) { FMASupported = 1; }
+    AVXInitialized = 1;
+}
+
+// AVX element-wise add: result[i] = op1[i] + op2[i] for i in [0, length).
+// Whole groups of 8 floats use unaligned 256-bit loads/stores; the 0-7
+// leftover elements are finished with scalar code, so no float beyond index
+// length-1 is read or written. A length of zero or less is a no-op.
+void VADD32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length)
+{
+    int32_t i = 0;
+
+    // Vector body: `length - i >= 8` avoids the overflow `i + 8 <= length` could hit.
+    for ( ; length - i >= 8 ; i += 8 )
+    {
+        const __m256 mm_op1 = _mm256_loadu_ps(op1 + i);
+        const __m256 mm_op2 = _mm256_loadu_ps(op2 + i);
+        _mm256_storeu_ps(result + i, _mm256_add_ps(mm_op1, mm_op2));
+    }
+
+    // Scalar tail for the remaining (length % 8) elements.
+    for ( ; i < length ; i++ )
+    {
+        result[i] = op1[i] + op2[i];
+    }
+}
+
+// AVX element-wise multiply: result[i] = op1[i] * op2[i] for i in [0, length).
+// Whole groups of 8 floats use unaligned 256-bit loads/stores; the 0-7
+// leftover elements are finished with scalar code, so no float beyond index
+// length-1 is read or written. A length of zero or less is a no-op.
+void VMUL32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length)
+{
+    int32_t i = 0;
+
+    // Vector body, 8 floats per iteration.
+    for ( ; length - i >= 8 ; i += 8 )
+    {
+        const __m256 mm_op1 = _mm256_loadu_ps(op1 + i);
+        const __m256 mm_op2 = _mm256_loadu_ps(op2 + i);
+        _mm256_storeu_ps(result + i, _mm256_mul_ps(mm_op1, mm_op2));
+    }
+    // Scalar tail for the remaining (length % 8) elements.
+    for ( ; i < length ; i++ )
+    {
+        result[i] = op1[i] * op2[i];
+    }
+}
+
+// AVX scale: result[i] = value * vec[i] for i in [0, length).
+// `value` is broadcast into all 8 lanes once; whole groups of 8 floats use
+// unaligned 256-bit loads/stores and the 0-7 leftover elements are handled
+// by scalar code, so no float beyond index length-1 is read or written.
+void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32_t length)
+{
+    const __m256 mm_op1 = _mm256_set1_ps(value);
+    int32_t i = 0;
+
+    // Vector body, 8 floats per iteration.
+    for ( ; length - i >= 8 ; i += 8 )
+    {
+        const __m256 mm_op2 = _mm256_loadu_ps(vec + i);
+        _mm256_storeu_ps(result + i, _mm256_mul_ps(mm_op1, mm_op2));
+    }
+
+    // Scalar tail for the remaining (length % 8) elements.
+    for ( ; i < length ; i++ )
+    {
+        result[i] = value * vec[i];
+    }
+}
+
+// AVX multiply-add: result[i] = value * vmul[i] + vadd[i] for i in [0, length).
+// Uses the fused _mm256_fmadd_ps when CheckFMASupport() found FMA on this CPU,
+// otherwise a separate multiply and add. Whole groups of 8 floats are
+// vectorised; the 0-7 leftover elements are finished with scalar code, so no
+// float beyond index length-1 is read or written.
+void VMADD32FLOAT_V_avx(const float value, const float* vmul, const float* vadd, float* result, int32_t length)
+{
+    const __m256 mm_val = _mm256_set1_ps(value);
+    int32_t i = 0;
+
+    CheckFMASupport();
+    // Vector body, 8 floats per iteration.
+    for ( ; length - i >= 8 ; i += 8 )
+    {
+        const __m256 mm_mul = _mm256_loadu_ps(vmul + i);
+        const __m256 mm_add = _mm256_loadu_ps(vadd + i);
+
+        __m256 mm_res;
+        if ( FMASupported )
+        {
+            mm_res = _mm256_fmadd_ps(mm_val, mm_mul, mm_add);
+        }
+        else
+        {
+            mm_res = _mm256_mul_ps(mm_val, mm_mul);
+            mm_res = _mm256_add_ps(mm_res, mm_add);
+        }
+        _mm256_storeu_ps(result + i, mm_res);
+    }
+    // Scalar tail for the remaining (length % 8) elements.
+    for ( ; i < length ; i++ )
+    {
+        result[i] = (value * vmul[i]) + vadd[i];
+    }
+}
diff --git a/common/optimization/avx.hpp b/common/optimization/avx.hpp
@@ -0,0 +1,15 @@
+
+#ifndef OPTIMIZATION_AVX_H_INCLUDED
+#define OPTIMIZATION_AVX_H_INCLUDED
+// AVX implementations of the float-vector kernels (definitions in avx.cpp).
+#include <stdint.h>
+// Adds op1 and op2 element-wise into result using _mm256_add_ps.
+void VADD32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length);
+// Multiplies op1 and op2 element-wise into result using _mm256_mul_ps.
+void VMUL32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length);
+// Multiplies the elements of vec by the scalar value into result.
+void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32_t length);
+// Computes value * vmul + vadd element-wise into result (fused when the CPU reports FMA).
+void VMADD32FLOAT_V_avx(const float value, const float* vmul, const float* vadd, float* result, int32_t length);
+
+#endif
diff --git a/common/optimization/default.cpp b/common/optimization/default.cpp
@@ -0,0 +1,34 @@
+#include "default.hpp"
+
+// Portable fallback: stores op1[k] + op2[k] into result[k] for k in [0, length).
+void VADD32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length)
+{
+    for ( int32_t left = length ; left > 0 ; --left ) {
+        *result++ = *op1++ + *op2++;
+    }
+}
+
+// Portable fallback: stores op1[k] * op2[k] into result[k] for k in [0, length).
+void VMUL32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length)
+{
+    for ( int32_t left = length ; left > 0 ; --left ) {
+        *result++ = *op1++ * *op2++;
+    }
+}
+
+// Portable fallback: stores value * vec[k] into result[k] for k in [0, length).
+void VMUL32FLOAT_V_default(const float value, const float* vec, float* result, int32_t length)
+{
+    for ( int32_t left = length ; left > 0 ; --left ) {
+        *result++ = value * *vec++;
+    }
+}
+
+// Portable fallback: stores (value * vmul[k]) + vadd[k] into result[k] for k in [0, length).
+void VMADD32FLOAT_V_default(const float value, const float* vmul, const float* vadd, float* result, int32_t length)
+{
+    for ( int32_t left = length ; left > 0 ; --left ) {
+        *result++ = (value * *vmul++) + *vadd++;
+    }
+}
+
diff --git a/common/optimization/default.hpp b/common/optimization/default.hpp
@@ -0,0 +1,15 @@
+
+#ifndef OPTIMIZATION_DEFAULT_H_INCLUDED
+#define OPTIMIZATION_DEFAULT_H_INCLUDED
+// Portable scalar-loop implementations of the float-vector kernels (definitions in default.cpp).
+#include <stdint.h>
+// result[i] = op1[i] + op2[i] for i in [0, length).
+void VADD32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length);
+// result[i] = op1[i] * op2[i] for i in [0, length).
+void VMUL32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length);
+// result[i] = value * vec[i] for i in [0, length).
+void VMUL32FLOAT_V_default(const float value, const float* vec, float* result, int32_t length);
+// result[i] = (value * vmul[i]) + vadd[i] for i in [0, length).
+void VMADD32FLOAT_V_default(const float value, const float* vmul, const float* vadd, float* result, int32_t length);
+
+#endif
diff --git a/common/optimization/optimization.cpp b/common/optimization/optimization.cpp
@@ -0,0 +1,80 @@
+
+#include "optimization.hpp"
+#include <stdint.h>
+
+#include "default.hpp"
+#include "avx.hpp"
+
+// Dispatch table: one function pointer per kernel; SetupOptimization() points them at the AVX or default versions.
+typedef struct 
+{
+    void (*VADD32FLOAT)(const float* op1, const float* op2, float* result, int32_t length);    
+    void (*VMUL32FLOAT)(const float* op1, const float* op2, float* result, int32_t length);    
+    void (*VMUL32FLOAT_V)(const float value, const float* vec, float* result, int32_t length);
+    void (*VMADD32FLOAT_V)(const float value, const float* vmul, const float* vadd, float* result, int32_t length);
+} _CpuOptimization;
+int8_t OptimizationInitialized = 0;  // set to 1 at the end of SetupOptimization()
+_CpuOptimization CpuOptimization;    // active kernel table, written by SetupOptimization()
+
+// Fills the CpuOptimization table (AVX kernels if the CPU reports "avx", else defaults) and marks it initialized.
+void SetupOptimization()
+{
+    // Start from the portable kernels; they stay selected unless AVX is detected below.
+    CpuOptimization.VADD32FLOAT = VADD32FLOAT_default;
+    CpuOptimization.VMUL32FLOAT = VMUL32FLOAT_default;
+    CpuOptimization.VMUL32FLOAT_V = VMUL32FLOAT_V_default;
+    CpuOptimization.VMADD32FLOAT_V = VMADD32FLOAT_V_default;
+#if defined(__x86_64__)
+    // The __builtin_cpu_* calls are target-specific built-ins, so they all stay inside the x86 guard.
+    __builtin_cpu_init();
+    if ( __builtin_cpu_supports("avx") )
+    {
+        CpuOptimization.VADD32FLOAT = VADD32FLOAT_avx;
+        CpuOptimization.VMUL32FLOAT = VMUL32FLOAT_avx;
+        CpuOptimization.VMUL32FLOAT_V = VMUL32FLOAT_V_avx;
+        CpuOptimization.VMADD32FLOAT_V = VMADD32FLOAT_V_avx;
+    }
+#endif
+    OptimizationInitialized = 1;
+}
+
+
+// Public element-wise add: runs SetupOptimization() on first use, then forwards to the selected kernel.
+void VADD32FLOAT(const float* op1, const float* op2, float* result, int32_t length)
+{
+    if ( OptimizationInitialized == 0 )
+    {
+        SetupOptimization();
+    }
+    (*CpuOptimization.VADD32FLOAT)(op1, op2, result, length);
+}
+
+// Public element-wise multiply: runs SetupOptimization() on first use, then forwards to the selected kernel.
+void VMUL32FLOAT(const float* op1, const float* op2, float* result, int32_t length)
+{
+    if ( OptimizationInitialized == 0 )
+    {
+        SetupOptimization();
+    }
+    (*CpuOptimization.VMUL32FLOAT)(op1, op2, result, length);
+}
+
+// Public scalar-times-vector multiply: runs SetupOptimization() on first use, then forwards to the selected kernel.
+void VMUL32FLOAT_V(const float value, const float* vec, float* result, int32_t length)
+{
+    if ( OptimizationInitialized == 0 )
+    {
+        SetupOptimization();
+    }
+    (*CpuOptimization.VMUL32FLOAT_V)(value, vec, result, length);
+}
+
+// Public multiply-add (value * vmul + vadd): runs SetupOptimization() on first use, then forwards to the selected kernel.
+void VMADD32FLOAT_V(const float value, const float* vmul, const float* vadd, float* result, int32_t length)
+{
+    if ( OptimizationInitialized == 0 )
+    {
+        SetupOptimization();
+    }
+    (*CpuOptimization.VMADD32FLOAT_V)(value, vmul, vadd, result, length);
+}
diff --git a/common/optimization/optimization.hpp b/common/optimization/optimization.hpp
@@ -0,0 +1,15 @@
+
+#ifndef OPTIMIZATION_H_INCLUDED
+#define OPTIMIZATION_H_INCLUDED
+// Public float-vector kernels; each forwards to the implementation chosen in optimization.cpp on first use.
+#include <stdint.h>
+// Element-wise add of op1 and op2 into result.
+void VADD32FLOAT(const float* op1, const float* op2, float* result, int32_t length);
+// Element-wise multiply of op1 and op2 into result.
+void VMUL32FLOAT(const float* op1, const float* op2, float* result, int32_t length);
+// Multiplies the elements of vec by the scalar value into result.
+void VMUL32FLOAT_V(const float value, const float* vec, float* result, int32_t length);
+// Computes value * vmul + vadd element-wise into result.
+void VMADD32FLOAT_V(const float value, const float* vmul, const float* vadd, float* result, int32_t length);
+
+#endif
diff --git a/plugins/dragonfly-early-reflections/DSP.cpp b/plugins/dragonfly-early-reflections/DSP.cpp
@@ -20,6 +20,10 @@
 #include "DistrhoPluginInfo.h"
 #include "extra/ScopedDenormalDisable.hpp"
 
+#ifdef USE_PLUGIN_SIMD
+#include "optimization/optimization.hpp"
+#endif
+
 #include "DSP.hpp"
 
 DragonflyReverbDSP::DragonflyReverbDSP(double sampleRate) {
@@ -91,6 +95,15 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
         buffer_frames
     );
 
+#ifdef USE_PLUGIN_SIMD
+    VMUL32FLOAT_V( dryLevel, &inputs[0][offset], dry_buffer, buffer_frames );
+    VMUL32FLOAT_V( wetLevel, output_buffer[0], wet_buffer, buffer_frames );
+    VADD32FLOAT(dry_buffer, wet_buffer, &outputs[0][offset], buffer_frames );
+
+    VMUL32FLOAT_V( dryLevel, &inputs[1][offset], dry_buffer, buffer_frames );
+    VMUL32FLOAT_V( wetLevel, output_buffer[1], wet_buffer, buffer_frames );
+    VADD32FLOAT(dry_buffer, wet_buffer, &outputs[1][offset], buffer_frames );
+#else
     for (uint32_t i = 0; i < buffer_frames; i++) {
       outputs[0][offset + i] =
         dryLevel * inputs[0][offset + i]  +
@@ -100,7 +113,7 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
         dryLevel * inputs[1][offset + i]  +
         wetLevel * output_buffer[1][i];
     }
-
+#endif
   }
 }
 

diff --git a/plugins/dragonfly-early-reflections/DSP.hpp b/plugins/dragonfly-early-reflections/DSP.hpp
@@ -46,6 +46,11 @@ class DragonflyReverbDSP : public AbstractDSP {
   float input_buffer[2][BUFFER_SIZE];
   float output_buffer[2][BUFFER_SIZE];
 
+#ifdef USE_PLUGIN_SIMD
+  float dry_buffer[BUFFER_SIZE];
+  float wet_buffer[BUFFER_SIZE];
+#endif
+
   void setInputLPF(float freq);
   void setInputHPF(float freq);
 };