From c58e05373febdfd1a4dc53c53bb82cd5281bcc8d Mon Sep 17 00:00:00 2001
From: THoehne <77296181+THoehne@users.noreply.github.com>
Date: Wed, 28 Aug 2024 14:26:40 +0200
Subject: [PATCH] SIMD improvements.

---
 .../public/Math/SIMD/PhanesVectorMathSSE.hpp  | 165 ++++++++++++++++++
 1 file changed, 165 insertions(+)

diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
index cda1c1b..77f78e0 100644
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
@@ -102,6 +102,16 @@ namespace Phanes::Core::Math::SIMD
     {
         return _mm_cmpeq_pd(v1, v2);
     }
+
+    /// Sets the last component (lane 3) of the register to zero.
+    /// That lane could otherwise hold unexpected values.
+    /// The register is taken by reference so the caller's value is actually modified.
+    /// Declared inline because this definition lives in a header.
+    inline void vec3_fix(Phanes::Core::Types::Vec4f32Reg& v1)
+    {
+        // A set mask bit i takes lane i from the second operand: 0x8 replaces only lane 3 with 0.
+        v1 = _mm_blend_ps(v1, _mm_setzero_ps(), 0x8);
+    }
 }
 
 
@@ -175,6 +185,12 @@ namespace Phanes::Core::Math::Detail
         {
             r.comp = _mm_sub_ps(v1.comp, _mm_set_ps1(s));
         }
+
+        // Scalar-minus-vector overload: every lane of r becomes s - v1[lane].
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4& r, float s, const Phanes::Core::Math::TVector4& v1)
+        {
+            r.comp = _mm_sub_ps(_mm_set_ps1(s), v1.comp);
+        }
     };
 
     template<>
@@ -203,6 +219,12 @@ namespace Phanes::Core::Math::Detail
         {
             r.comp = _mm_div_ps(v1.comp, _mm_set_ps1(s));
         }
+
+        // Scalar-over-vector overload: every lane of r becomes s / v1[lane].
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4& r, float s, const Phanes::Core::Math::TVector4& v1)
+        {
+            r.comp = _mm_div_ps(_mm_set_ps1(s), v1.comp);
+        }
     };
 
     template<>
@@ -223,6 +245,70 @@ namespace Phanes::Core::Math::Detail
         }
     };
 
+    // Magnitude: square root of the sum of the four squared lanes.
+    template<>
+    struct compute_vec4_mag
+    {
+        static FORCEINLINE float map(const Phanes::Core::Math::TVector4& v1)
+        {
+            __m128 tmp = _mm_mul_ps(v1.data, v1.data);
+            // NOTE(review): m128_f32 is an MSVC-specific member of __m128 - confirm the supported compilers.
+            return sqrt(tmp.m128_f32[0] + tmp.m128_f32[1] + tmp.m128_f32[2] + tmp.m128_f32[3]);
+        }
+    };
+
+    // Dot product: forwards both registers to SIMD::vec4_dot_cvtf32 (presumably the 4-lane dot as a float - verify).
+    template<>
+    struct compute_vec4_dotp
+    {
+        static FORCEINLINE float map(const Phanes::Core::Math::TVector4& v1, const Phanes::Core::Math::TVector4& v2)
+        {
+            return SIMD::vec4_dot_cvtf32(v1.data, v2.data);
+        }
+    };
+
+    // Set: stores x, y, z, w in lanes 0..3 (in that order) of v1.
+    template<>
+    struct compute_vec4_set
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4& v1, float x, float y, float z, float w)
+        {
+            v1.data = _mm_setr_ps(x, y, z, w);
+        }
+    };
+
+    // Max: lane-wise maximum of v1 and v2, written to r.
+    template<>
+    struct compute_vec4_max
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4& r, const Phanes::Core::Math::TVector4& v1, const Phanes::Core::Math::TVector4& v2)
+        {
+            r.data = _mm_max_ps(v1.data, v2.data);
+        }
+    };
+
+    // Min: lane-wise minimum of v1 and v2, written to r.
+    template<>
+    struct compute_vec4_min
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4& r, const Phanes::Core::Math::TVector4& v1, const Phanes::Core::Math::TVector4& v2)
+        {
+            r.data = _mm_min_ps(v1.data, v2.data);
+        }
+    };
+
+    // Division by w: divides every lane of v1 by v1.w, then clears lane 3 of the result.
+    template<>
+    struct compute_vec4_pdiv
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4& r, const Phanes::Core::Math::TVector4& v1)
+        {
+            __m128 tmp = _mm_div_ps(v1.data, _mm_set_ps1(v1.w));
+            // Mask 0x8 takes only lane 3 (w) from the zero vector; 0x1 would have cleared lane 0 (x).
+            r.data = _mm_blend_ps(tmp, _mm_setzero_ps(), 0x8);
+        }
+    };
+
 
     // ============ //
     // TVector3 //
@@ -260,6 +346,15 @@ namespace Phanes::Core::Math::Detail
         }
     };
 
+    // Set: stores x, y, z in lanes 0..2 and 0.0f in lane 3.
+    template<>
+    struct compute_vec3_set
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector3& v1, float x, float y, float z)
+        {
+            v1.data = _mm_setr_ps(x, y, z, 0.0f);
+        }
+    };
 
     template<> struct compute_vec3_add : public compute_vec4_add {};
     template<> struct compute_vec3_sub : public compute_vec4_sub {};
@@ -267,6 +362,11 @@ namespace Phanes::Core::Math::Detail
     template<> struct compute_vec3_div : public compute_vec4_div {};
     template<> struct compute_vec3_inc : public compute_vec4_inc {};
     template<> struct compute_vec3_dec : public compute_vec4_dec {};
+    // These reuse the four-lane kernels, so lane 3 takes part in the result; compute_vec3_set stores 0.0f there.
+    template<> struct compute_vec3_mag : public compute_vec4_mag {};
+    template<> struct compute_vec3_dotp : public compute_vec4_dotp {};
+    template<> struct compute_vec3_max : public compute_vec4_max {};
+    template<> struct compute_vec3_min : public compute_vec4_min {};
 
     template<>
     struct compute_vec3_cross_p
@@ -336,6 +436,12 @@ namespace Phanes::Core::Math::Detail
         {
             r.comp = _mm_sub_pd(v1.comp, _mm_set1_pd(s));
         }
+
+        // Scalar-minus-vector overload: both lanes of r become s - v1[lane].
+        static FORCEINLINE void map(Phanes::Core::Math::TVector2& r, double s, const Phanes::Core::Math::TVector2& v1)
+        {
+            r.comp = _mm_sub_pd(_mm_set1_pd(s), v1.comp);
+        }
     };
 
     template<>
@@ -364,6 +470,12 @@ namespace Phanes::Core::Math::Detail
         {
             r.comp = _mm_div_pd(v1.comp, _mm_set1_pd(s));
         }
+
+        // Scalar-over-vector overload: both lanes of r become s / v1[lane].
+        static FORCEINLINE void map(Phanes::Core::Math::TVector2& r, double s, const Phanes::Core::Math::TVector2& v1)
+        {
+            r.comp = _mm_div_pd(_mm_set1_pd(s), v1.comp);
+        }
     };
 
     template<>
@@ -384,6 +496,59 @@ namespace Phanes::Core::Math::Detail
         }
     };
 
+    // Magnitude: square root of the sum of the two squared lanes.
+    template
+    struct compute_vec2_mag
+    {
+        static FORCEINLINE double map(const Phanes::Core::Math::TVector2& v1)
+        {
+            __m128d tmp = _mm_mul_pd(v1.data, v1.data);
+            // NOTE(review): m128d_f64 is an MSVC-specific member of __m128d - confirm the supported compilers.
+            return sqrt(tmp.m128d_f64[0] + tmp.m128d_f64[1]);
+        }
+    };
+
+    // Dot product of v1 and v2: lane-wise product, then the two lanes are summed.
+    template<>
+    struct compute_vec2_dotp
+    {
+        static FORCEINLINE double map(const Phanes::Core::Math::TVector2& v1, const Phanes::Core::Math::TVector2& v2)
+        {
+            __m128d tmp = _mm_mul_pd(v1.data, v2.data);
+            return tmp.m128d_f64[0] + tmp.m128d_f64[1];
+        }
+    };
+
+    // Max: lane-wise maximum of v1 and v2, written to r.
+    template<>
+    struct compute_vec2_max
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector2& r, const Phanes::Core::Math::TVector2& v1, const Phanes::Core::Math::TVector2& v2)
+        {
+            r.data = _mm_max_pd(v1.data, v2.data);
+        }
+    };
+
+    // Min: lane-wise minimum of v1 and v2, written to r.
+    template<>
+    struct compute_vec2_min
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector2& r, const Phanes::Core::Math::TVector2& v1, const Phanes::Core::Math::TVector2& v2)
+        {
+            r.data = _mm_min_pd(v1.data, v2.data);
+        }
+    };
+
+    // Set: stores x in lane 0 and y in lane 1 of v1.
+    template<>
+    struct compute_vec2_set
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector2& v1, double x, double y)
+        {
+            v1.data = _mm_setr_pd(x, y);
+        }
+    };
+
 
     // =============== //
     // TIntVector4 //