diff -Naurd a/math/Simd.cpp b/math/Simd.cpp
--- a/math/Simd.cpp 2024-10-29 03:55:07.000000000 +0100
+++ b/math/Simd.cpp 2024-11-15 16:12:56.489496800 +0100
@@ -43,18 +43,19 @@
#include "idlib/math/Simd_SSE.h"
#include "idlib/math/Simd_SSE2.h"
#include "idlib/math/Simd_SSE3.h"
+#include "idlib/math/Simd_AVX.h"
+#include "idlib/math/Simd_AVX2.h"
#include "idlib/math/Simd_AltiVec.h"
#include "idlib/math/Plane.h"
#include "idlib/bv/Bounds.h"
#include "idlib/Lib.h"
#include "framework/Common.h"
#include "renderer/Model.h"
-
#include "idlib/math/Simd.h"
-idSIMDProcessor * processor = NULL; // pointer to SIMD processor
-idSIMDProcessor * generic = NULL; // pointer to generic SIMD implementation
-idSIMDProcessor * SIMDProcessor = NULL;
+idSIMDProcessor ยจ*processor = NULL; // pointer to SIMD processor
+idSIMDProcessor *generic = NULL; // pointer to generic SIMD implementation
+idSIMDProcessor *SIMDProcessor = NULL;
/*
================
@@ -74,36 +75,99 @@
============
*/
void idSIMD::InitProcessor( const char *module, bool forceGeneric ) {
- int cpuid;
- idSIMDProcessor *newProcessor;
+ int cpuid = idLib::sys->GetProcessorId();
- cpuid = idLib::sys->GetProcessorId();
+ if ( processor != generic ) {
+ delete processor;
+ processor = NULL;
+ SIMDProcessor = generic;
+ }
+ idSIMDProcessor *newProcessor = NULL;
- if ( forceGeneric ) {
+// stgatilov: force cpuid bits for SIMD choice if compiler macros are set
+// this is used for Elbrus compiler, which can cross-compile SSE intrinsics but has no CPUID instruction
+#ifdef __MMX__
+ cpuid |= CPUID_MMX;
+#endif
+#ifdef __3dNOW__
+ cpuid |= CPUID_3DNOW;
+#endif
+#ifdef __SSE__
+ cpuid |= CPUID_SSE;
+#endif
+#ifdef __SSE2__
+ cpuid |= CPUID_SSE2;
+#endif
+#ifdef __SSE3__
+ cpuid |= CPUID_SSE3;
+#endif
+#ifdef __SSE4_1__
+ cpuid |= CPUID_SSE41;
+#endif
+#ifdef __AVX__
+ cpuid |= CPUID_AVX;
+#endif
+#ifdef __AVX2__
+ cpuid |= CPUID_AVX2;
+ cpuid |= CPUID_FMA3;
+#endif
+#ifdef __ALTIVEC__
+ cpuid |= CPUID_ALTIVEC;
+#endif
- newProcessor = generic;
+ // Print what we found to console
+ idLib::common->Printf( "Found %s CPU, With these features: %s %s %s %s %s %s %s %s %s %s %s\n",
+ // Vendor
+ cpuid & CPUID_AMD ? "AMD" :
+ cpuid & CPUID_INTEL ? "Intel" :
+ cpuid & CPUID_GENERIC ? "Generic" :
+ "Unsupported",
+ // Flags
+ cpuid & CPUID_MMX ? " MMX" : "",
+ cpuid & CPUID_3DNOW ? " 3DNow" : "",
+ cpuid & CPUID_SSE ? " SSE" : "",
+ cpuid & CPUID_SSE2 ? " SSE2" : "",
+ cpuid & CPUID_SSE3 ? " SSE3" : "",
+ cpuid & CPUID_SSE41 ? " SSE41" : "",
+ cpuid & CPUID_AVX ? " AVX" : "",
+ cpuid & CPUID_AVX2 ? " AVX2" : "",
+ cpuid & CPUID_FMA3 ? " FMA3" : "",
+ cpuid & CPUID_ALTIVEC ? " ALTIVEC" : "" );
+ if ( forceGeneric ) {
+ newProcessor = generic;
} else {
-
if ( !processor ) {
- if ( ( cpuid & CPUID_ALTIVEC ) ) {
+ bool upToMMX = ( cpuid & CPUID_MMX );
+ bool upTo3DNow = upToMMX && ( cpuid & CPUID_3DNOW ) && ( cpuid & CPUID_AMD ); // newer AMD processors no longer support this.
+ bool upToSSE = upToMMX && ( cpuid & CPUID_SSE );
+ bool upToSSE2 = upToSSE && ( cpuid & CPUID_SSE2 );
+ bool upToSSE3 = upToSSE2 && ( cpuid & CPUID_SSE3 );
+ bool upToAVX = upToSSE3 && ( cpuid & CPUID_AVX );
+ bool upToAVX2 = upToAVX && ( cpuid & CPUID_AVX2 ) && ( cpuid & CPUID_FMA3 );
+ bool isAlTiVec = ( cpuid & CPUID_ALTIVEC ); // unused on anything but apple i think...
+
+ if ( isAlTiVec ) {
processor = new idSIMD_AltiVec;
- } else if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_SSE ) && ( cpuid & CPUID_SSE2 ) && ( cpuid & CPUID_SSE3 ) ) {
+ } else if ( upToAVX2 ) {
+ processor = new idSIMD_AVX2;
+ } else if ( upToAVX ) {
+ processor = new idSIMD_AVX;
+ } else if ( upToSSE3 ) {
processor = new idSIMD_SSE3;
- } else if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_SSE ) && ( cpuid & CPUID_SSE2 ) ) {
+ } else if ( upToSSE2 ) {
processor = new idSIMD_SSE2;
- } else if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_SSE ) ) {
+ } else if ( upToSSE ) {
processor = new idSIMD_SSE;
- } else if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_3DNOW ) ) {
+ } else if ( upTo3DNow ) {
processor = new idSIMD_3DNow;
- } else if ( ( cpuid & CPUID_MMX ) ) {
+ } else if ( upToMMX ) {
processor = new idSIMD_MMX;
} else {
processor = generic;
}
processor->cpuid = cpuid;
}
-
newProcessor = processor;
}
@@ -112,7 +176,7 @@
idLib::common->Printf( "%s using %s for SIMD processing\n", module, SIMDProcessor->GetName() );
}
- if ( cpuid & CPUID_SSE ) {
+ if ( cpuid & CPUID_SSE2 ) {
idLib::sys->FPU_SetFTZ( true );
idLib::sys->FPU_SetDAZ( true );
}
@@ -4038,6 +4102,18 @@
return;
}
p_simd = new idSIMD_SSE3();
+ } else if ( idStr::Icmp( argString, "AVX" ) == 0 ) {
+ if ( !( cpuid & CPUID_SSE ) || !( cpuid & CPUID_SSE2 ) || !( cpuid & CPUID_SSE3 ) || !( cpuid & CPUID_AVX ) ) {
+ common->Printf( "CPU does not support SSE* & AVX\n" );
+ return;
+ }
+ p_simd = new idSIMD_AVX();
+ } else if ( idStr::Icmp( argString, "AVX2" ) == 0 ) {
+ if ( !( cpuid & CPUID_SSE ) || !( cpuid & CPUID_SSE2 ) || !( cpuid & CPUID_SSE3 ) || !( cpuid & CPUID_AVX ) || !( cpuid & CPUID_AVX2 ) || !( cpuid & CPUID_FMA3 ) ) {
+ common->Printf( "CPU does not support SSE* & AVX & AVX2 & FMA3\n" );
+ return;
+ }
+ p_simd = new idSIMD_AVX2();
} else if ( idStr::Icmp( argString, "AltiVec" ) == 0 ) {
if ( !( cpuid & CPUID_ALTIVEC ) ) {
common->Printf( "CPU does not support AltiVec\n" );
diff -Naurd a/math/Simd.h b/math/Simd.h
--- a/math/Simd.h 2024-10-29 03:55:07.000000000 +0100
+++ b/math/Simd.h 2024-11-10 17:54:55.405941100 +0100
@@ -45,12 +45,11 @@
class idSIMD {
public:
static void Init( void );
- static void InitProcessor( const char *module, bool forceGeneric );
+ static void InitProcessor( const char *module, bool forceGeneric = false );
static void Shutdown( void );
static void Test_f( const class idCmdArgs &args );
};
-
/*
===============================================================================
@@ -94,11 +93,10 @@
SPEAKER_BACKRIGHT
} speakerLabel;
-
class idSIMDProcessor {
public:
idSIMDProcessor( void ) { cpuid = CPUID_NONE; }
- virtual ~idSIMDProcessor() { };
+ virtual ~idSIMDProcessor() {};
int cpuid;
@@ -199,6 +197,10 @@
virtual void VPCALL MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) = 0;
virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) = 0;
virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) = 0;
+
+ // plane culling
+ virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) = 0;
+ virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) = 0;
};
// pointer to SIMD processor
diff -Naurd a/math/Simd_AVX.cpp b/math/Simd_AVX.cpp
--- a/math/Simd_AVX.cpp 1970-01-01 01:00:00.000000000 +0100
+++ b/math/Simd_AVX.cpp 2024-11-15 15:40:03.886268200 +0100
@@ -0,0 +1,139 @@
+/*
+===========================================================================
+
+Doom 3 GPL Source Code
+Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
+
+This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
+
+Doom 3 Source Code is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Doom 3 Source Code is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Doom 3 Source Code. If not, see .
+
+In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
+
+If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
+
+===========================================================================
+*/
+
+#include "sys/platform.h"
+#include "Simd_AVX.h"
+
+//===============================================================
+//
+// AVX implementation of idSIMDProcessor
+//
+//===============================================================
+
+#if defined(__GNUC__) && defined(__SSE3__)
+
+/*
+============
+idSIMD_SSE3::GetName
+============
+*/
+const char *idSIMD_AVX::GetName( void ) const {
+ return "MMX & SSE & SSE2 & SSE3 & AVX";
+}
+
+#elif defined(_MSC_VER) && defined(_M_IX86)
+
+#include
+
+#include "idlib/geometry/DrawVert.h"
+#include "idlib/geometry/JointTransform.h"
+#include "idlib/math/Vector.h"
+#include "idlib/math/Plane.h"
+
+/*
+============
+idSIMD_AVX::GetName
+============
+*/
+const char *idSIMD_AVX::GetName( void ) const {
+ return "MMX & SSE & SSE2 & SSE3 & AVX";
+}
+
+/*
+============
+idSIMD_AVX::CullByFrustum
+============
+*/
+void VPCALL idSIMD_AVX::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) {
+ const __m256 fA = _mm256_set_ps( 0, 0, frustum[5][0], frustum[4][0], frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] );
+ const __m256 fB = _mm256_set_ps( 0, 0, frustum[5][1], frustum[4][1], frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] );
+ const __m256 fC = _mm256_set_ps( 0, 0, frustum[5][2], frustum[4][2], frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );
+ const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
+ const __m256 eps = _mm256_set1_ps( epsilon );
+ const byte mask6 = ( 1 << 6 ) - 1;
+
+ for( int j = 0; j < numVerts; j++ ) {
+ idVec3 &vec = verts[j].xyz;
+ __m256 vX = _mm256_set1_ps( vec.x );
+ __m256 vY = _mm256_set1_ps( vec.y );
+ __m256 vZ = _mm256_set1_ps( vec.z );
+ __m256 d =
+ _mm256_add_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps( fA, vX ),
+ _mm256_mul_ps( fB, vY )
+ ),
+ _mm256_add_ps(
+ _mm256_mul_ps( fC, vZ ),
+ fD
+ )
+ );
+ int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) );
+ pointCull[j] = ( byte )mask_lo & mask6;
+ }
+ _mm256_zeroupper();
+}
+
+/*
+============
+idSIMD_AVX::CullByFrustum2
+============
+*/
+void VPCALL idSIMD_AVX::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) {
+ const __m256 fA = _mm256_set_ps( 0, 0, frustum[5][0], frustum[4][0], frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] );
+ const __m256 fB = _mm256_set_ps( 0, 0, frustum[5][1], frustum[4][1], frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] );
+ const __m256 fC = _mm256_set_ps( 0, 0, frustum[5][2], frustum[4][2], frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );
+ const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
+ const __m256 eps = _mm256_set1_ps( epsilon );
+ static const __m256 epsM = _mm256_set1_ps( -epsilon );
+ const short mask6 = ( 1 << 6 ) - 1;
+
+ for( int j = 0; j < numVerts; j++ ) {
+ idVec3 &vec = verts[j].xyz;
+ __m256 vX = _mm256_set1_ps( vec.x );
+ __m256 vY = _mm256_set1_ps( vec.y );
+ __m256 vZ = _mm256_set1_ps( vec.z );
+ __m256 d =
+ _mm256_add_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps( fA, vX ),
+ _mm256_mul_ps( fB, vY )
+ ),
+ _mm256_add_ps(
+ _mm256_mul_ps( fC, vZ ),
+ fD
+ )
+ );
+ int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) );
+ int mask_hi = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_GT_OQ ) );
+ pointCull[j] = ( unsigned short )( mask_lo & mask6 | ( mask_hi & mask6 ) << 6 );
+ }
+ _mm256_zeroupper();
+}
+
+#endif
diff -Naurd a/math/Simd_AVX.h b/math/Simd_AVX.h
--- a/math/Simd_AVX.h 1970-01-01 01:00:00.000000000 +0100
+++ b/math/Simd_AVX.h 2024-11-08 21:03:16.820910100 +0100
@@ -0,0 +1,40 @@
+/*****************************************************************************
+ The Dark Mod GPL Source Code
+
+ This file is part of the The Dark Mod Source Code, originally based
+ on the Doom 3 GPL Source Code as published in 2011.
+
+ The Dark Mod Source Code is free software: you can redistribute it
+ and/or modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version. For details, see LICENSE.TXT.
+
+ Project: The Dark Mod (http://www.thedarkmod.com/)
+
+******************************************************************************/
+
+#ifndef __MATH_SIMD_AVX_H__
+#define __MATH_SIMD_AVX_H__
+
+#include "idlib/math/Simd_SSE3.h"
+
+/*
+===============================================================================
+
+ AVX implementation of idSIMDProcessor
+
+===============================================================================
+*/
+
+class idSIMD_AVX : public idSIMD_SSE3 {
+public:
+#if defined(__GNUC__) && defined(__AVX__)
+ virtual const char *VPCALL GetName( void ) const;
+#elif defined(_MSC_VER) && defined(_M_IX86)
+ virtual const char *VPCALL GetName( void ) const;
+ virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
+ virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
+#endif
+};
+
+#endif /* !__MATH_SIMD_AVX_H__ */
diff -Naurd a/math/Simd_AVX2.cpp b/math/Simd_AVX2.cpp
--- a/math/Simd_AVX2.cpp 1970-01-01 01:00:00.000000000 +0100
+++ b/math/Simd_AVX2.cpp 2024-11-15 15:40:03.886268200 +0100
@@ -0,0 +1,128 @@
+/*
+===========================================================================
+
+Doom 3 GPL Source Code
+Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
+
+This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
+
+Doom 3 Source Code is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Doom 3 Source Code is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Doom 3 Source Code. If not, see .
+
+In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
+
+If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
+
+===========================================================================
+*/
+
+#include "sys/platform.h"
+#include "Simd_AVX2.h"
+
+//===============================================================
+//
+// AVX implementation of idSIMDProcessor
+//
+//===============================================================
+
+#if defined(__GNUC__) && defined(__SSE3__)
+
+/*
+============
+idSIMD_SSE3::GetName
+============
+*/
+const char *idSIMD_AVX:2:GetName( void ) const {
+ return "MMX & SSE & SSE2 & SSE3 & AVX & AVX2";
+}
+
+#elif defined(_MSC_VER) && defined(_M_IX86)
+
+#include
+
+#include "idlib/geometry/DrawVert.h"
+#include "idlib/geometry/JointTransform.h"
+#include "idlib/math/Vector.h"
+#include "idlib/math/Plane.h"
+
+/*
+============
+idSIMD_AVX2::GetName
+============
+*/
+const char *idSIMD_AVX2::GetName( void ) const {
+ return "MMX & SSE & SSE2 & SSE3 & AVX & AVX2";
+}
+
+/*
+============
+idSIMD_AVX2::CullByFrustum
+============
+*/
+void VPCALL idSIMD_AVX2::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) {
+ const __m256 fA = _mm256_set_ps( 0, 0, frustum[5][0], frustum[4][0], frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] );
+ const __m256 fB = _mm256_set_ps( 0, 0, frustum[5][1], frustum[4][1], frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] );
+ const __m256 fC = _mm256_set_ps( 0, 0, frustum[5][2], frustum[4][2], frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );
+ const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
+ const __m256 eps = _mm256_set1_ps( epsilon );
+ const byte mask6 = (1 << 6) - 1;
+
+ for ( int j = 0; j < numVerts; j++ ) {
+ idVec3 &vec = verts[j].xyz;
+ __m256 vX = _mm256_set1_ps( vec.x );
+ __m256 vY = _mm256_set1_ps( vec.y );
+ __m256 vZ = _mm256_set1_ps( vec.z );
+ __m256 d = _mm256_fmadd_ps( fA, vX,
+ _mm256_fmadd_ps( fB, vY,
+ _mm256_fmadd_ps( fC, vZ, fD )
+ )
+ );
+ int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) );
+ pointCull[j] = (byte)mask_lo & mask6;
+ }
+ _mm256_zeroupper();
+}
+
+/*
+============
+idSIMD_AVX2::CullByFrustum2
+============
+*/
+void VPCALL idSIMD_AVX2::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) {
+ const __m256 fA = _mm256_set_ps( 0, 0, frustum[5][0], frustum[4][0], frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] );
+ const __m256 fB = _mm256_set_ps( 0, 0, frustum[5][1], frustum[4][1], frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] );
+ const __m256 fC = _mm256_set_ps( 0, 0, frustum[5][2], frustum[4][2], frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );
+ const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
+ const __m256 eps = _mm256_set1_ps( epsilon );
+ static const __m256 epsM = _mm256_set1_ps( -epsilon );
+ const short mask6 = (1 << 6) - 1;
+
+ for ( int j = 0; j < numVerts; j++ ) {
+ idVec3 &vec = verts[j].xyz;
+ __m256 vX = _mm256_set1_ps( vec.x );
+ __m256 vY = _mm256_set1_ps( vec.y );
+ __m256 vZ = _mm256_set1_ps( vec.z );
+ __m256 d = _mm256_fmadd_ps( fA, vX,
+ _mm256_fmadd_ps( fB, vY,
+ _mm256_fmadd_ps( fC, vZ, fD )
+ )
+ );
+ int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) );
+ int mask_hi = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_GT_OQ ) );
+ pointCull[j] = (unsigned short)(mask_lo & mask6 | (mask_hi & mask6) << 6);
+ }
+ _mm256_zeroupper();
+}
+
+#endif
+
diff -Naurd a/math/Simd_AVX2.h b/math/Simd_AVX2.h
--- a/math/Simd_AVX2.h 1970-01-01 01:00:00.000000000 +0100
+++ b/math/Simd_AVX2.h 2024-11-08 21:03:16.431643700 +0100
@@ -0,0 +1,40 @@
+/*****************************************************************************
+ The Dark Mod GPL Source Code
+
+ This file is part of the The Dark Mod Source Code, originally based
+ on the Doom 3 GPL Source Code as published in 2011.
+
+ The Dark Mod Source Code is free software: you can redistribute it
+ and/or modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation, either version 3 of the License,
+ or (at your option) any later version. For details, see LICENSE.TXT.
+
+ Project: The Dark Mod (http://www.thedarkmod.com/)
+
+******************************************************************************/
+
+#ifndef __MATH_SIMD_AVX2_H__
+#define __MATH_SIMD_AVX2_H__
+
+#include "idlib/math/Simd_AVX.h"
+
+/*
+===============================================================================
+
+ AVX2 implementation of idSIMDProcessor
+
+===============================================================================
+*/
+
+class idSIMD_AVX2 : public idSIMD_AVX {
+public:
+#if defined(__GNUC__) && defined(__AVX__)
+ virtual const char *VPCALL GetName( void ) const;
+#elif defined(_MSC_VER) && defined(_M_IX86)
+ virtual const char *VPCALL GetName( void ) const;
+ virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
+ virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
+#endif
+};
+
+#endif /* !__MATH_SIMD_AVX2_H__ */
diff -Naurd a/math/Simd_Generic.cpp b/math/Simd_Generic.cpp
--- a/math/Simd_Generic.cpp 2024-10-29 03:55:07.000000000 +0100
+++ b/math/Simd_Generic.cpp 2024-11-15 15:59:04.591330500 +0100
@@ -33,7 +33,6 @@
#include "idlib/math/Plane.h"
#include "idlib/math/Matrix.h"
#include "renderer/Model.h"
-
#include "idlib/math/Simd_Generic.h"
//===============================================================
@@ -3069,3 +3068,44 @@
}
}
}
+
+/*
+============
+idSIMD_Generic::CullByFrustum
+
+Moved from R_CalcInteractionCullBits
+============
+*/
+void VPCALL idSIMD_Generic::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) {
+ for ( int j = 0; j < numVerts; j++ ) {
+ idVec3 &vec = verts[j].xyz;
+ byte bits = 0;
+
+ for ( int i = 0; i < 6; i++ ) {
+ float d = frustum[i].Distance( vec );
+ bits |= ( d < epsilon ) << i;
+ }
+ pointCull[j] = bits;
+ }
+}
+
+/*
+============
+idSIMD_Generic::CullByFrustum2
+
+Moved from R_CalcPointCull
+============
+*/
+void VPCALL idSIMD_Generic::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) {
+ for ( int j = 0; j < numVerts; j++ ) {
+ idVec3 &vec = verts[j].xyz;
+ short bits = 0;
+
+ for ( int i = 0; i < 6; i++ ) {
+ float d = frustum[i].Distance( vec );
+ bits |= ( d < epsilon ) << i;
+ bits |= ( d > -epsilon ) << ( i + 6 );
+ }
+ pointCull[j] = bits;
+ }
+}
diff -Naurd a/math/Simd_Generic.h b/math/Simd_Generic.h
--- a/math/Simd_Generic.h 2024-10-29 03:55:07.000000000 +0100
+++ b/math/Simd_Generic.h 2024-11-15 16:14:15.613412800 +0100
@@ -134,6 +134,9 @@
virtual void VPCALL MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] );
virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] );
virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples );
+
+ virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
+ virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
};
#endif /* !__MATH_SIMD_GENERIC_H__ */
diff -Naurd a/math/Simd_SSE.cpp b/math/Simd_SSE.cpp
--- a/math/Simd_SSE.cpp 2024-10-29 03:55:07.000000000 +0100
+++ b/math/Simd_SSE.cpp 2024-11-15 16:12:56.499128800 +0100
@@ -28,7 +28,6 @@
#include "sys/platform.h"
#include "idlib/geometry/DrawVert.h"
-
#include "idlib/math/Simd_SSE.h"
//===============================================================
@@ -18094,4 +18093,106 @@
#endif
}
+/*
+============
+idSIMD_SSE::CullByFrustum
+============
+*/
+void VPCALL idSIMD_SSE::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) {
+ __m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] );
+ __m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] );
+ __m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] );
+ __m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] );
+ __m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );
+ __m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] );
+ __m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
+ __m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] );
+
+ for ( int j = 0; j < numVerts; j++ ) {
+ idVec3 &vec = verts[j].xyz;
+ __m128 vX = _mm_set1_ps( vec.x );
+ __m128 vY = _mm_set1_ps( vec.y );
+ __m128 vZ = _mm_set1_ps( vec.z );
+ __m128 d14 = _mm_add_ps(
+ _mm_add_ps(
+ _mm_mul_ps( fA14, vX ),
+ _mm_mul_ps( fB14, vY )
+ ),
+ _mm_add_ps(
+ _mm_mul_ps( fC14, vZ ),
+ fD14
+ )
+ );
+ __m128 d56 = _mm_add_ps(
+ _mm_add_ps(
+ _mm_mul_ps( fA56, vX ),
+ _mm_mul_ps( fB56, vY )
+ ),
+ _mm_add_ps(
+ _mm_mul_ps( fC56, vZ ),
+ fD56
+ )
+ );
+ const short mask6 = ( 1 << 6 ) - 1;
+ __m128 eps = _mm_set1_ps( epsilon );
+ int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) );
+ int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) );
+ int mask_lo = mask_lo14 | mask_lo56 << 4;
+ pointCull[j] = mask_lo & mask6;
+ }
+}
+
+/*
+============
+idSIMD_SSE::CullByFrustum2
+============
+*/
+void VPCALL idSIMD_SSE::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) {
+ __m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] );
+ __m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] );
+ __m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] );
+ __m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] );
+ __m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] );
+ __m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] );
+ __m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
+ __m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] );
+
+ for ( int j = 0; j < numVerts; j++ ) {
+ idVec3 &vec = verts[j].xyz;
+ __m128 vX = _mm_set1_ps( vec.x );
+ __m128 vY = _mm_set1_ps( vec.y );
+ __m128 vZ = _mm_set1_ps( vec.z );
+ __m128 d14 = _mm_add_ps(
+ _mm_add_ps(
+ _mm_mul_ps( fA14, vX ),
+ _mm_mul_ps( fB14, vY )
+ ),
+ _mm_add_ps(
+ _mm_mul_ps( fC14, vZ ),
+ fD14
+ )
+ );
+ __m128 d56 = _mm_add_ps(
+ _mm_add_ps(
+ _mm_mul_ps( fA56, vX ),
+ _mm_mul_ps( fB56, vY )
+ ),
+ _mm_add_ps(
+ _mm_mul_ps( fC56, vZ ),
+ fD56
+ )
+ );
+ const short mask6 = ( 1 << 6 ) - 1;
+ __m128 eps = _mm_set1_ps( epsilon );
+ int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) );
+ int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) );
+ eps = _mm_set1_ps( -epsilon );
+ int mask_hi14 = _mm_movemask_ps( _mm_cmpgt_ps( d14, eps ) );
+ int mask_hi56 = _mm_movemask_ps( _mm_cmpgt_ps( d56, eps ) );
+ int mask_lo = mask_lo14 | mask_lo56 << 4;
+ int mask_hi = mask_hi14 | mask_hi56 << 4;
+ pointCull[j] = mask_lo & mask6 | ( mask_hi & mask6 ) << 6;
+ }
+}
+
#endif /* _MSC_VER */
diff -Naurd a/math/Simd_SSE.h b/math/Simd_SSE.h
--- a/math/Simd_SSE.h 2024-10-29 03:55:07.000000000 +0100
+++ b/math/Simd_SSE.h 2024-11-15 16:08:15.865054000 +0100
@@ -143,6 +143,8 @@
virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] );
virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples );
+ virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
+ virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
#endif
};