From d25345aa07ef7ca19b02c545642b868f28be565d Mon Sep 17 00:00:00 2001 From: scorpioblood <77296181+scorpioblood@users.noreply.github.com> Date: Tue, 28 May 2024 22:01:51 +0200 Subject: [PATCH] More SIMD. --- .../Core/public/Math/SIMD/PhanesSIMDTypes.h | 2 + .../public/Math/SIMD/PhanesVectorMathSSE.hpp | 74 ++++++++++++- .../Runtime/Core/public/Math/SIMD/Platform.h | 24 +++- .../Runtime/Core/public/Math/Vector4.hpp | 12 +- .../Runtime/Core/public/Math/Vector4.inl | 103 +++++++++++++++++- 5 files changed, 193 insertions(+), 22 deletions(-) diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMDTypes.h b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMDTypes.h index b614e0c..6ff9116 100644 --- a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMDTypes.h +++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesSIMDTypes.h @@ -1,3 +1,5 @@ +#pragma once + // This file includes the necessary header for vectorization intrinsics. If no specifics are defined SSE4.2 is used. // // ARM is not supported. diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp index 5901cb3..f0815c7 100644 --- a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp +++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp @@ -1,10 +1,30 @@ #pragma once -#include "Core/public/Math/Boilerplate.h" #include <nmmintrin.h> +#include "Core/public/Math/SIMD/PhanesSIMDTypes.h" +#include "Core/public/Math/Boilerplate.h" +#include "Core/public/Math/MathCommon.hpp" + +#include <iostream> + // -> For IntelliSense +#include "Core/public/Math/Vector4.hpp" + +// ========== // +// Common // +// ========== // + + +Phanes::Core::Types::Vec4f32Reg p_vec4_abs(const Phanes::Core::Types::Vec4f32Reg& v) +{ + return _mm_and_ps(v, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF))); +} + + + + // ============ // // TVector4 // // ============ // @@ -22,4 +42,54 @@ namespace Phanes::Core::Math::Detail r.comp = _mm_add_ps(v1.comp, v2.comp); } }; -} \ No newline at end of file + + template<> + struct compute_vec4_sub<float, true> + { + static FORCEINLINE void map(Phanes::Core::Math::TVector4<float, true>& r, const Phanes::Core::Math::TVector4<float, true>& v1, const Phanes::Core::Math::TVector4<float, true>& v2) + { + r.comp = _mm_sub_ps(v1.comp, v2.comp); + } + }; + + template<> + struct compute_vec4_mul<float, true> + { + static FORCEINLINE void map(Phanes::Core::Math::TVector4<float, true>& r, const Phanes::Core::Math::TVector4<float, true>& v1, const Phanes::Core::Math::TVector4<float, true>& v2) + { + r.comp = _mm_mul_ps(v1.comp, v2.comp); + } + }; + + template<> + struct compute_vec4_div<float, true> + { + static FORCEINLINE void map(Phanes::Core::Math::TVector4<float, true>& r, const Phanes::Core::Math::TVector4<float, true>& v1, const Phanes::Core::Math::TVector4<float, true>& v2) + { + r.comp = _mm_div_ps(v1.comp, v2.comp); + } + }; + + template<> + struct compute_vec4_eq<float, true> + { + static FORCEINLINE bool map(const Phanes::Core::Math::TVector4<float, true>& v1, const Phanes::Core::Math::TVector4<float, true>& v2) + { + float r; + _mm_store_ps1(&r, _mm_cmpeq_ps(v1.comp, v2.comp)); + return (r == 0xffffffff) ? true : false; + } + }; + + template<> + struct compute_vec4_ieq<float, true> + { + static FORCEINLINE bool map(const Phanes::Core::Math::TVector4<float, true>& v1, const Phanes::Core::Math::TVector4<float, true>& v2) + { + float r; + _mm_store_ps1(&r, _mm_cmpneq_ps(v1.comp, v2.comp)); + return (r == 0xffffffff) ? true : false; + } + }; +} + diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h b/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h index c15b890..a2b8b61 100644 --- a/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h +++ b/Engine/Source/Runtime/Core/public/Math/SIMD/Platform.h @@ -254,11 +254,6 @@ // Define also supported instruction sets for Visual Studio, as it only defines the latest (e.g. only __AVX__ not __SSE4__ ...). -#define P_AVX2__ 0 -#define P_AVX__ 0 -#define P_SSE__ 0 -#define P_NEON__ 0 - #ifdef P_FORCE_INTRINSICS # undef __AVX2__ @@ -272,7 +267,7 @@ #else # ifdef __AVX2__ -# define P_AVX2__ 1 +# define P_AVX2__ 1 # elif defined(__AVX__) # define P_AVX__ 1 # elif defined(__SSE__) @@ -290,6 +285,23 @@ # define P_SSE__ 1 #endif +// Deactivate unset SIMD +#ifndef P_AVX2__ +# define P_AVX2__ 0 +#endif + +// Deactivate unset SIMD +#ifndef P_AVX__ +# define P_AVX__ 0 +#endif + +#ifndef P_SSE__ +# define P_SSE__ 0 +#endif + +#ifndef P_NEON__ +# define P_NEON__ 0 +#endif #define P_INTRINSICS_FPU 0 #define P_INTRINSICS_SSE 1 diff --git a/Engine/Source/Runtime/Core/public/Math/Vector4.hpp b/Engine/Source/Runtime/Core/public/Math/Vector4.hpp index 63bcd2f..81f771d 100644 --- a/Engine/Source/Runtime/Core/public/Math/Vector4.hpp +++ b/Engine/Source/Runtime/Core/public/Math/Vector4.hpp @@ -90,13 +90,6 @@ namespace Phanes::Core::Math /// <param name="comp">Array of at least 4 components</param> TVector4(const Real* comp); - /// <summary> - /// Construct the vector, by calculating the way between two points. - /// </summary> - /// <param name="start">Starting point of the vector.</param> - /// <param name="end">End point of the vector.</param> - TVector4(const TPoint4<Real>& start, const TPoint4<Real>& end); - }; // ===================== // @@ -698,8 +691,5 @@ namespace Phanes::Core::Math TVector4<T, A> PrespectiveDivideV(TVector4<T, A>& v1); } -// No SIMD -#include "Core/public/Math/Vector4.inl" -// SIMD -#include "Core/public/Math/SIMD/SIMDIntrinsics.h" \ No newline at end of file +#include "Core/public/Math/Vector4.inl" diff --git a/Engine/Source/Runtime/Core/public/Math/Vector4.inl b/Engine/Source/Runtime/Core/public/Math/Vector4.inl index 4f2cfcc..f419752 100644 --- a/Engine/Source/Runtime/Core/public/Math/Vector4.inl +++ b/Engine/Source/Runtime/Core/public/Math/Vector4.inl @@ -3,8 +3,13 @@ #include "Core/public/Math/Boilerplate.h" #include "Core/public/Math/Detail/Vector4Decl.inl" +#include "Core/public/Math/SIMD/SIMDIntrinsics.h" + #include "Core/public/Math/Vector4.hpp" + +#include "Core/public/Math/SIMD/PhanesSIMDTypes.h" + #include <stdio.h> namespace Phanes::Core::Math @@ -25,6 +30,30 @@ namespace Phanes::Core::Math w(_w) {} + template<RealType T, bool A> + Phanes::Core::Math::TVector4<T, A>::TVector4(Real s) : + x(s), + y(s), + z(s), + w(s) + {} + + template<RealType T, bool A> + Phanes::Core::Math::TVector4<T, A>::TVector4(const TVector2<Real>& v1, const TVector2<Real>& v2) : + x(v1.x), + y(v1.y), + z(v2.x), + w(v2.y) + {} + + template<RealType T, bool A> + Phanes::Core::Math::TVector4<T, A>::TVector4(const Real* comp) : + x(comp[0]), + y(comp[1]), + z(comp[2]), + w(comp[3]) + {} + template<RealType T, bool A> TVector4<T, A> operator+=(TVector4<T, A>& v1, const TVector4<T, A>& v2) { @@ -159,19 +188,87 @@ namespace Phanes::Core::Math return Detail::compute_vec4_ieq<T, A>::map(v1, v2); } + + + // Inc- / Decrement + template<RealType T, bool A> + TVector4<T, A>& operator++(TVector4<T, A>& v1) + { + ++v1.x; + ++v1.y; + ++v1.z; + ++v1.w; - // SIMD + return v1; + } + template<RealType T, bool A> + TVector4<T, A>& operator--(TVector4<T, A>& v1) + { + --v1.x; + --v1.y; + --v1.z; + --v1.w; + + return v1; + } + + template<RealType T, bool A> + TVector4<T, A>& operator++(TVector4<T, A>& v1, int) + { + return ++v1; + } + + template<RealType T, bool A> + TVector4<T, A>& operator--(TVector4<T, A>& v1, int) + { + return --v1; + } + + + // SIMD constructor template<> - TVector4<float, true>::TVector4(Real _x, Real _y, Real _z, Real _w) : + TVector4<float, true>::TVector4(const TVector4<float, true>& v) + { + this->comp = _mm_load_ps(reinterpret_cast<const float*>(&v)); + } + + template<> + TVector4<float, true>::TVector4(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) { - this->comp = _mm_load_ps(reinterpret_cast<float*>(&x)); + this->comp = _mm_load_ps(reinterpret_cast<float*>(&this->x)); + } + + template<> + TVector4<float, true>::TVector4(float s) + { + this->comp = _mm_load_ps1(&s); + } + + template<> + TVector4<float, true>::TVector4(const TVector2<float>& v1, const TVector2<float>& v2) : + x(v1.x), + y(v1.y), + z(v2.x), + w(v2.y) + { + this->comp = _mm_load_ps(reinterpret_cast<float*>(&this->x)); + } + + template<> + TVector4<float, true>::TVector4(const float* comp) : + x(comp[0]), + y(comp[1]), + z(comp[2]), + w(comp[3]) + { + this->comp = _mm_load_ps(reinterpret_cast<float*>(&this->x)); } } \ No newline at end of file