diff -Naurd a/math/Simd.cpp b/math/Simd.cpp --- a/math/Simd.cpp 2024-10-29 03:55:07.000000000 +0100 +++ b/math/Simd.cpp 2024-11-15 16:12:56.489496800 +0100 @@ -43,18 +43,19 @@ #include "idlib/math/Simd_SSE.h" #include "idlib/math/Simd_SSE2.h" #include "idlib/math/Simd_SSE3.h" +#include "idlib/math/Simd_AVX.h" +#include "idlib/math/Simd_AVX2.h" #include "idlib/math/Simd_AltiVec.h" #include "idlib/math/Plane.h" #include "idlib/bv/Bounds.h" #include "idlib/Lib.h" #include "framework/Common.h" #include "renderer/Model.h" - #include "idlib/math/Simd.h" -idSIMDProcessor * processor = NULL; // pointer to SIMD processor -idSIMDProcessor * generic = NULL; // pointer to generic SIMD implementation -idSIMDProcessor * SIMDProcessor = NULL; +idSIMDProcessor ยจ*processor = NULL; // pointer to SIMD processor +idSIMDProcessor *generic = NULL; // pointer to generic SIMD implementation +idSIMDProcessor *SIMDProcessor = NULL; /* ================ @@ -74,36 +75,99 @@ ============ */ void idSIMD::InitProcessor( const char *module, bool forceGeneric ) { - int cpuid; - idSIMDProcessor *newProcessor; + int cpuid = idLib::sys->GetProcessorId(); - cpuid = idLib::sys->GetProcessorId(); + if ( processor != generic ) { + delete processor; + processor = NULL; + SIMDProcessor = generic; + } + idSIMDProcessor *newProcessor = NULL; - if ( forceGeneric ) { +// stgatilov: force cpuid bits for SIMD choice if compiler macros are set +// this is used for Elbrus compiler, which can cross-compile SSE intrinsics but has no CPUID instruction +#ifdef __MMX__ + cpuid |= CPUID_MMX; +#endif +#ifdef __3dNOW__ + cpuid |= CPUID_3DNOW; +#endif +#ifdef __SSE__ + cpuid |= CPUID_SSE; +#endif +#ifdef __SSE2__ + cpuid |= CPUID_SSE2; +#endif +#ifdef __SSE3__ + cpuid |= CPUID_SSE3; +#endif +#ifdef __SSE4_1__ + cpuid |= CPUID_SSE41; +#endif +#ifdef __AVX__ + cpuid |= CPUID_AVX; +#endif +#ifdef __AVX2__ + cpuid |= CPUID_AVX2; + cpuid |= CPUID_FMA3; +#endif +#ifdef __ALTIVEC__ + cpuid |= CPUID_ALTIVEC; +#endif - newProcessor = generic; + // Print what we found to console + idLib::common->Printf( "Found %s CPU, With these features: %s %s %s %s %s %s %s %s %s %s %s\n", + // Vendor + cpuid & CPUID_AMD ? "AMD" : + cpuid & CPUID_INTEL ? "Intel" : + cpuid & CPUID_GENERIC ? "Generic" : + "Unsupported", + // Flags + cpuid & CPUID_MMX ? " MMX" : "", + cpuid & CPUID_3DNOW ? " 3DNow" : "", + cpuid & CPUID_SSE ? " SSE" : "", + cpuid & CPUID_SSE2 ? " SSE2" : "", + cpuid & CPUID_SSE3 ? " SSE3" : "", + cpuid & CPUID_SSE41 ? " SSE41" : "", + cpuid & CPUID_AVX ? " AVX" : "", + cpuid & CPUID_AVX2 ? " AVX2" : "", + cpuid & CPUID_FMA3 ? " FMA3" : "", + cpuid & CPUID_ALTIVEC ? " ALTIVEC" : "" ); + if ( forceGeneric ) { + newProcessor = generic; } else { - if ( !processor ) { - if ( ( cpuid & CPUID_ALTIVEC ) ) { + bool upToMMX = ( cpuid & CPUID_MMX ); + bool upTo3DNow = upToMMX && ( cpuid & CPUID_3DNOW ) && ( cpuid & CPUID_AMD ); // newer AMD processors no longer support this. + bool upToSSE = upToMMX && ( cpuid & CPUID_SSE ); + bool upToSSE2 = upToSSE && ( cpuid & CPUID_SSE2 ); + bool upToSSE3 = upToSSE2 && ( cpuid & CPUID_SSE3 ); + bool upToAVX = upToSSE3 && ( cpuid & CPUID_AVX ); + bool upToAVX2 = upToAVX && ( cpuid & CPUID_AVX2 ) && ( cpuid & CPUID_FMA3 ); + bool isAlTiVec = ( cpuid & CPUID_ALTIVEC ); // unused on anything but apple i think... + + if ( isAlTiVec ) { processor = new idSIMD_AltiVec; - } else if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_SSE ) && ( cpuid & CPUID_SSE2 ) && ( cpuid & CPUID_SSE3 ) ) { + } else if ( upToAVX2 ) { + processor = new idSIMD_AVX2; + } else if ( upToAVX ) { + processor = new idSIMD_AVX; + } else if ( upToSSE3 ) { processor = new idSIMD_SSE3; - } else if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_SSE ) && ( cpuid & CPUID_SSE2 ) ) { + } else if ( upToSSE2 ) { processor = new idSIMD_SSE2; - } else if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_SSE ) ) { + } else if ( upToSSE ) { processor = new idSIMD_SSE; - } else if ( ( cpuid & CPUID_MMX ) && ( cpuid & CPUID_3DNOW ) ) { + } else if ( upTo3DNow ) { processor = new idSIMD_3DNow; - } else if ( ( cpuid & CPUID_MMX ) ) { + } else if ( upToMMX ) { processor = new idSIMD_MMX; } else { processor = generic; } processor->cpuid = cpuid; } - newProcessor = processor; } @@ -112,7 +176,7 @@ idLib::common->Printf( "%s using %s for SIMD processing\n", module, SIMDProcessor->GetName() ); } - if ( cpuid & CPUID_SSE ) { + if ( cpuid & CPUID_SSE2 ) { idLib::sys->FPU_SetFTZ( true ); idLib::sys->FPU_SetDAZ( true ); } @@ -4038,6 +4102,18 @@ return; } p_simd = new idSIMD_SSE3(); + } else if ( idStr::Icmp( argString, "AVX" ) == 0 ) { + if ( !( cpuid & CPUID_SSE ) || !( cpuid & CPUID_SSE2 ) || !( cpuid & CPUID_SSE3 ) || !( cpuid & CPUID_AVX ) ) { + common->Printf( "CPU does not support SSE* & AVX\n" ); + return; + } + p_simd = new idSIMD_AVX(); + } else if ( idStr::Icmp( argString, "AVX2" ) == 0 ) { + if ( !( cpuid & CPUID_SSE ) || !( cpuid & CPUID_SSE2 ) || !( cpuid & CPUID_SSE3 ) || !( cpuid & CPUID_AVX ) || !( cpuid & CPUID_AVX2 ) || !( cpuid & CPUID_FMA3 ) ) { + common->Printf( "CPU does not support SSE* & AVX & AVX2 & FMA3\n" ); + return; + } + p_simd = new idSIMD_AVX2(); } else if ( idStr::Icmp( argString, "AltiVec" ) == 0 ) { if ( !( cpuid & CPUID_ALTIVEC ) ) { common->Printf( "CPU does not support AltiVec\n" ); diff -Naurd a/math/Simd.h b/math/Simd.h --- a/math/Simd.h 2024-10-29 03:55:07.000000000 +0100 +++ b/math/Simd.h 2024-11-10 17:54:55.405941100 +0100 @@ -45,12 +45,11 @@ class idSIMD { public: static void Init( void ); - static void InitProcessor( const char *module, bool forceGeneric ); + static void InitProcessor( const char *module, bool forceGeneric = false ); static void Shutdown( void ); static void Test_f( const class idCmdArgs &args ); }; - /* =============================================================================== @@ -94,11 +93,10 @@ SPEAKER_BACKRIGHT } speakerLabel; - class idSIMDProcessor { public: idSIMDProcessor( void ) { cpuid = CPUID_NONE; } - virtual ~idSIMDProcessor() { }; + virtual ~idSIMDProcessor() {}; int cpuid; @@ -199,6 +197,10 @@ virtual void VPCALL MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) = 0; virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) = 0; virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) = 0; + + // plane culling + virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) = 0; + virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) = 0; }; // pointer to SIMD processor diff -Naurd a/math/Simd_AVX.cpp b/math/Simd_AVX.cpp --- a/math/Simd_AVX.cpp 1970-01-01 01:00:00.000000000 +0100 +++ b/math/Simd_AVX.cpp 2024-11-15 15:40:03.886268200 +0100 @@ -0,0 +1,139 @@ +/* +=========================================================================== + +Doom 3 GPL Source Code +Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company. + +This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code"). + +Doom 3 Source Code is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +Doom 3 Source Code is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Doom 3 Source Code. If not, see . + +In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below. + +If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA. + +=========================================================================== +*/ + +#include "sys/platform.h" +#include "Simd_AVX.h" + +//=============================================================== +// +// AVX implementation of idSIMDProcessor +// +//=============================================================== + +#if defined(__GNUC__) && defined(__SSE3__) + +/* +============ +idSIMD_SSE3::GetName +============ +*/ +const char *idSIMD_AVX::GetName( void ) const { + return "MMX & SSE & SSE2 & SSE3 & AVX"; +} + +#elif defined(_MSC_VER) && defined(_M_IX86) + +#include + +#include "idlib/geometry/DrawVert.h" +#include "idlib/geometry/JointTransform.h" +#include "idlib/math/Vector.h" +#include "idlib/math/Plane.h" + +/* +============ +idSIMD_AVX::GetName +============ +*/ +const char *idSIMD_AVX::GetName( void ) const { + return "MMX & SSE & SSE2 & SSE3 & AVX"; +} + +/* +============ +idSIMD_AVX::CullByFrustum +============ +*/ +void VPCALL idSIMD_AVX::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) { + const __m256 fA = _mm256_set_ps( 0, 0, frustum[5][0], frustum[4][0], frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] ); + const __m256 fB = _mm256_set_ps( 0, 0, frustum[5][1], frustum[4][1], frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] ); + const __m256 fC = _mm256_set_ps( 0, 0, frustum[5][2], frustum[4][2], frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); + const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); + const __m256 eps = _mm256_set1_ps( epsilon ); + const byte mask6 = ( 1 << 6 ) - 1; + + for( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + __m256 vX = _mm256_set1_ps( vec.x ); + __m256 vY = _mm256_set1_ps( vec.y ); + __m256 vZ = _mm256_set1_ps( vec.z ); + __m256 d = + _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps( fA, vX ), + _mm256_mul_ps( fB, vY ) + ), + _mm256_add_ps( + _mm256_mul_ps( fC, vZ ), + fD + ) + ); + int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) ); + pointCull[j] = ( byte )mask_lo & mask6; + } + _mm256_zeroupper(); +} + +/* +============ +idSIMD_AVX::CullByFrustum2 +============ +*/ +void VPCALL idSIMD_AVX::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) { + const __m256 fA = _mm256_set_ps( 0, 0, frustum[5][0], frustum[4][0], frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] ); + const __m256 fB = _mm256_set_ps( 0, 0, frustum[5][1], frustum[4][1], frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] ); + const __m256 fC = _mm256_set_ps( 0, 0, frustum[5][2], frustum[4][2], frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); + const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); + const __m256 eps = _mm256_set1_ps( epsilon ); + static const __m256 epsM = _mm256_set1_ps( -epsilon ); + const short mask6 = ( 1 << 6 ) - 1; + + for( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + __m256 vX = _mm256_set1_ps( vec.x ); + __m256 vY = _mm256_set1_ps( vec.y ); + __m256 vZ = _mm256_set1_ps( vec.z ); + __m256 d = + _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps( fA, vX ), + _mm256_mul_ps( fB, vY ) + ), + _mm256_add_ps( + _mm256_mul_ps( fC, vZ ), + fD + ) + ); + int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) ); + int mask_hi = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_GT_OQ ) ); + pointCull[j] = ( unsigned short )( mask_lo & mask6 | ( mask_hi & mask6 ) << 6 ); + } + _mm256_zeroupper(); +} + +#endif diff -Naurd a/math/Simd_AVX.h b/math/Simd_AVX.h --- a/math/Simd_AVX.h 1970-01-01 01:00:00.000000000 +0100 +++ b/math/Simd_AVX.h 2024-11-08 21:03:16.820910100 +0100 @@ -0,0 +1,40 @@ +/***************************************************************************** + The Dark Mod GPL Source Code + + This file is part of the The Dark Mod Source Code, originally based + on the Doom 3 GPL Source Code as published in 2011. + + The Dark Mod Source Code is free software: you can redistribute it + and/or modify it under the terms of the GNU General Public License as + published by the Free Software Foundation, either version 3 of the License, + or (at your option) any later version. For details, see LICENSE.TXT. + + Project: The Dark Mod (http://www.thedarkmod.com/) + +******************************************************************************/ + +#ifndef __MATH_SIMD_AVX_H__ +#define __MATH_SIMD_AVX_H__ + +#include "idlib/math/Simd_SSE3.h" + +/* +=============================================================================== + + AVX implementation of idSIMDProcessor + +=============================================================================== +*/ + +class idSIMD_AVX : public idSIMD_SSE3 { +public: +#if defined(__GNUC__) && defined(__AVX__) + virtual const char *VPCALL GetName( void ) const; +#elif defined(_MSC_VER) && defined(_M_IX86) + virtual const char *VPCALL GetName( void ) const; + virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ); + virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ); +#endif +}; + +#endif /* !__MATH_SIMD_AVX_H__ */ diff -Naurd a/math/Simd_AVX2.cpp b/math/Simd_AVX2.cpp --- a/math/Simd_AVX2.cpp 1970-01-01 01:00:00.000000000 +0100 +++ b/math/Simd_AVX2.cpp 2024-11-15 15:40:03.886268200 +0100 @@ -0,0 +1,128 @@ +/* +=========================================================================== + +Doom 3 GPL Source Code +Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company. + +This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code"). + +Doom 3 Source Code is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +Doom 3 Source Code is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Doom 3 Source Code. If not, see . + +In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below. + +If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA. + +=========================================================================== +*/ + +#include "sys/platform.h" +#include "Simd_AVX2.h" + +//=============================================================== +// +// AVX implementation of idSIMDProcessor +// +//=============================================================== + +#if defined(__GNUC__) && defined(__SSE3__) + +/* +============ +idSIMD_SSE3::GetName +============ +*/ +const char *idSIMD_AVX:2:GetName( void ) const { + return "MMX & SSE & SSE2 & SSE3 & AVX & AVX2"; +} + +#elif defined(_MSC_VER) && defined(_M_IX86) + +#include + +#include "idlib/geometry/DrawVert.h" +#include "idlib/geometry/JointTransform.h" +#include "idlib/math/Vector.h" +#include "idlib/math/Plane.h" + +/* +============ +idSIMD_AVX2::GetName +============ +*/ +const char *idSIMD_AVX2::GetName( void ) const { + return "MMX & SSE & SSE2 & SSE3 & AVX & AVX2"; +} + +/* +============ +idSIMD_AVX2::CullByFrustum +============ +*/ +void VPCALL idSIMD_AVX2::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) { + const __m256 fA = _mm256_set_ps( 0, 0, frustum[5][0], frustum[4][0], frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] ); + const __m256 fB = _mm256_set_ps( 0, 0, frustum[5][1], frustum[4][1], frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] ); + const __m256 fC = _mm256_set_ps( 0, 0, frustum[5][2], frustum[4][2], frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); + const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); + const __m256 eps = _mm256_set1_ps( epsilon ); + const byte mask6 = (1 << 6) - 1; + + for ( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + __m256 vX = _mm256_set1_ps( vec.x ); + __m256 vY = _mm256_set1_ps( vec.y ); + __m256 vZ = _mm256_set1_ps( vec.z ); + __m256 d = _mm256_fmadd_ps( fA, vX, + _mm256_fmadd_ps( fB, vY, + _mm256_fmadd_ps( fC, vZ, fD ) + ) + ); + int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) ); + pointCull[j] = (byte)mask_lo & mask6; + } + _mm256_zeroupper(); +} + +/* +============ +idSIMD_AVX2::CullByFrustum2 +============ +*/ +void VPCALL idSIMD_AVX2::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) { + const __m256 fA = _mm256_set_ps( 0, 0, frustum[5][0], frustum[4][0], frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] ); + const __m256 fB = _mm256_set_ps( 0, 0, frustum[5][1], frustum[4][1], frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] ); + const __m256 fC = _mm256_set_ps( 0, 0, frustum[5][2], frustum[4][2], frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); + const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); + const __m256 eps = _mm256_set1_ps( epsilon ); + static const __m256 epsM = _mm256_set1_ps( -epsilon ); + const short mask6 = (1 << 6) - 1; + + for ( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + __m256 vX = _mm256_set1_ps( vec.x ); + __m256 vY = _mm256_set1_ps( vec.y ); + __m256 vZ = _mm256_set1_ps( vec.z ); + __m256 d = _mm256_fmadd_ps( fA, vX, + _mm256_fmadd_ps( fB, vY, + _mm256_fmadd_ps( fC, vZ, fD ) + ) + ); + int mask_lo = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_LT_OQ ) ); + int mask_hi = _mm256_movemask_ps( _mm256_cmp_ps( d, eps, _CMP_GT_OQ ) ); + pointCull[j] = (unsigned short)(mask_lo & mask6 | (mask_hi & mask6) << 6); + } + _mm256_zeroupper(); +} + +#endif + diff -Naurd a/math/Simd_AVX2.h b/math/Simd_AVX2.h --- a/math/Simd_AVX2.h 1970-01-01 01:00:00.000000000 +0100 +++ b/math/Simd_AVX2.h 2024-11-08 21:03:16.431643700 +0100 @@ -0,0 +1,40 @@ +/***************************************************************************** + The Dark Mod GPL Source Code + + This file is part of the The Dark Mod Source Code, originally based + on the Doom 3 GPL Source Code as published in 2011. + + The Dark Mod Source Code is free software: you can redistribute it + and/or modify it under the terms of the GNU General Public License as + published by the Free Software Foundation, either version 3 of the License, + or (at your option) any later version. For details, see LICENSE.TXT. + + Project: The Dark Mod (http://www.thedarkmod.com/) + +******************************************************************************/ + +#ifndef __MATH_SIMD_AVX2_H__ +#define __MATH_SIMD_AVX2_H__ + +#include "idlib/math/Simd_AVX.h" + +/* +=============================================================================== + + AVX2 implementation of idSIMDProcessor + +=============================================================================== +*/ + +class idSIMD_AVX2 : public idSIMD_AVX { +public: +#if defined(__GNUC__) && defined(__AVX__) + virtual const char *VPCALL GetName( void ) const; +#elif defined(_MSC_VER) && defined(_M_IX86) + virtual const char *VPCALL GetName( void ) const; + virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ); + virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ); +#endif +}; + +#endif /* !__MATH_SIMD_AVX2_H__ */ diff -Naurd a/math/Simd_Generic.cpp b/math/Simd_Generic.cpp --- a/math/Simd_Generic.cpp 2024-10-29 03:55:07.000000000 +0100 +++ b/math/Simd_Generic.cpp 2024-11-15 15:59:04.591330500 +0100 @@ -33,7 +33,6 @@ #include "idlib/math/Plane.h" #include "idlib/math/Matrix.h" #include "renderer/Model.h" - #include "idlib/math/Simd_Generic.h" //=============================================================== @@ -3069,3 +3068,44 @@ } } } + +/* +============ +idSIMD_Generic::CullByFrustum + +Moved from R_CalcInteractionCullBits +============ +*/ +void VPCALL idSIMD_Generic::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) { + for ( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + byte bits = 0; + + for ( int i = 0; i < 6; i++ ) { + float d = frustum[i].Distance( vec ); + bits |= ( d < epsilon ) << i; + } + pointCull[j] = bits; + } +} + +/* +============ +idSIMD_Generic::CullByFrustum2 + +Moved from R_CalcPointCull +============ +*/ +void VPCALL idSIMD_Generic::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) { + for ( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + short bits = 0; + + for ( int i = 0; i < 6; i++ ) { + float d = frustum[i].Distance( vec ); + bits |= ( d < epsilon ) << i; + bits |= ( d > -epsilon ) << ( i + 6 ); + } + pointCull[j] = bits; + } +} diff -Naurd a/math/Simd_Generic.h b/math/Simd_Generic.h --- a/math/Simd_Generic.h 2024-10-29 03:55:07.000000000 +0100 +++ b/math/Simd_Generic.h 2024-11-15 16:14:15.613412800 +0100 @@ -134,6 +134,9 @@ virtual void VPCALL MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ); virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ); virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ); + + virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ); + virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ); }; #endif /* !__MATH_SIMD_GENERIC_H__ */ diff -Naurd a/math/Simd_SSE.cpp b/math/Simd_SSE.cpp --- a/math/Simd_SSE.cpp 2024-10-29 03:55:07.000000000 +0100 +++ b/math/Simd_SSE.cpp 2024-11-15 16:12:56.499128800 +0100 @@ -28,7 +28,6 @@ #include "sys/platform.h" #include "idlib/geometry/DrawVert.h" - #include "idlib/math/Simd_SSE.h" //=============================================================== @@ -18094,4 +18093,106 @@ #endif } +/* +============ +idSIMD_SSE::CullByFrustum +============ +*/ +void VPCALL idSIMD_SSE::CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ) { + __m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] ); + __m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] ); + __m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] ); + __m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] ); + __m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); + __m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] ); + __m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); + __m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] ); + + for ( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + __m128 vX = _mm_set1_ps( vec.x ); + __m128 vY = _mm_set1_ps( vec.y ); + __m128 vZ = _mm_set1_ps( vec.z ); + __m128 d14 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA14, vX ), + _mm_mul_ps( fB14, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC14, vZ ), + fD14 + ) + ); + __m128 d56 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA56, vX ), + _mm_mul_ps( fB56, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC56, vZ ), + fD56 + ) + ); + const short mask6 = ( 1 << 6 ) - 1; + __m128 eps = _mm_set1_ps( epsilon ); + int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) ); + int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) ); + int mask_lo = mask_lo14 | mask_lo56 << 4; + pointCull[j] = mask_lo & mask6; + } +} + +/* +============ +idSIMD_SSE::CullByFrustum2 +============ +*/ +void VPCALL idSIMD_SSE::CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ) { + __m128 fA14 = _mm_set_ps( frustum[3][0], frustum[2][0], frustum[1][0], frustum[0][0] ); + __m128 fA56 = _mm_set_ps( 0, 0, frustum[5][0], frustum[4][0] ); + __m128 fB14 = _mm_set_ps( frustum[3][1], frustum[2][1], frustum[1][1], frustum[0][1] ); + __m128 fB56 = _mm_set_ps( 0, 0, frustum[5][1], frustum[4][1] ); + __m128 fC14 = _mm_set_ps( frustum[3][2], frustum[2][2], frustum[1][2], frustum[0][2] ); + __m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] ); + __m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] ); + __m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] ); + + for ( int j = 0; j < numVerts; j++ ) { + idVec3 &vec = verts[j].xyz; + __m128 vX = _mm_set1_ps( vec.x ); + __m128 vY = _mm_set1_ps( vec.y ); + __m128 vZ = _mm_set1_ps( vec.z ); + __m128 d14 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA14, vX ), + _mm_mul_ps( fB14, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC14, vZ ), + fD14 + ) + ); + __m128 d56 = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps( fA56, vX ), + _mm_mul_ps( fB56, vY ) + ), + _mm_add_ps( + _mm_mul_ps( fC56, vZ ), + fD56 + ) + ); + const short mask6 = ( 1 << 6 ) - 1; + __m128 eps = _mm_set1_ps( epsilon ); + int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) ); + int mask_lo56 = _mm_movemask_ps( _mm_cmplt_ps( d56, eps ) ); + eps = _mm_set1_ps( -epsilon ); + int mask_hi14 = _mm_movemask_ps( _mm_cmpgt_ps( d14, eps ) ); + int mask_hi56 = _mm_movemask_ps( _mm_cmpgt_ps( d56, eps ) ); + int mask_lo = mask_lo14 | mask_lo56 << 4; + int mask_hi = mask_hi14 | mask_hi56 << 4; + pointCull[j] = mask_lo & mask6 | ( mask_hi & mask6 ) << 6; + } +} + #endif /* _MSC_VER */ diff -Naurd a/math/Simd_SSE.h b/math/Simd_SSE.h --- a/math/Simd_SSE.h 2024-10-29 03:55:07.000000000 +0100 +++ b/math/Simd_SSE.h 2024-11-15 16:08:15.865054000 +0100 @@ -143,6 +143,8 @@ virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ); virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ); + virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon ); + virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon ); #endif };