From c58e05373febdfd1a4dc53c53bb82cd5281bcc8d Mon Sep 17 00:00:00 2001
From: THoehne <77296181+THoehne@users.noreply.github.com>
Date: Wed, 28 Aug 2024 14:26:40 +0200
Subject: [PATCH] SIMD improvements.

---
 .../public/Math/SIMD/PhanesVectorMathSSE.hpp  | 150 ++++++++++++++++++
 1 file changed, 150 insertions(+)
diff --git a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
index cda1c1b..77f78e0 100644
--- a/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
+++ b/Engine/Source/Runtime/Core/public/Math/SIMD/PhanesVectorMathSSE.hpp
@@ -102,6 +102,16 @@ namespace Phanes::Core::Math::SIMD
     {
         return _mm_cmpeq_pd(v1, v2);
     }
+
+    /// <summary>
+    /// Sets the last (fourth) component of the register to zero. <br>
+    /// The last component could hold unexpected values.
+    /// </summary>
+    /// <param name="v1">Register to fix. It is modified in place.</param>
+    inline void vec3_fix(Phanes::Core::Types::Vec4f32Reg& v1)
+    {
+        v1 = _mm_blend_ps(v1, _mm_setzero_ps(), 0x8); // bit 3 takes lane 3 from the zero register; 0x1 would clear lane 0
+    }
 }
 
 
@@ -175,6 +185,11 @@ namespace Phanes::Core::Math::Detail
         {
             r.comp = _mm_sub_ps(v1.comp, _mm_set_ps1(s));
         }
+
+        // Scalar-on-the-left overload: s is broadcast to all four lanes and v1 is
+        // subtracted from it, so every lane of r holds (s - corresponding lane of v1).
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4<float, true>& r, float s, const Phanes::Core::Math::TVector4<float, true>& v1)
+        { r.comp = _mm_sub_ps(_mm_set1_ps(s), v1.comp); }
     };
 
     template<>
@@ -203,6 +218,11 @@ namespace Phanes::Core::Math::Detail
         {
             r.comp = _mm_div_ps(v1.comp, _mm_set_ps1(s));
         }
+
+        // Scalar-on-the-left overload: s is broadcast to all four lanes and divided
+        // by v1, so every lane of r holds (s / corresponding lane of v1).
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4<float, true>& r, float s, const Phanes::Core::Math::TVector4<float, true>& v1)
+        { r.comp = _mm_div_ps(_mm_set1_ps(s), v1.comp); }
     };
 
     template<>
@@ -223,6 +243,62 @@ namespace Phanes::Core::Math::Detail
         }
     };
 
+    // Magnitude of a float vector: square root of the vector's dot product with itself.
+    template<> struct compute_vec4_mag<float, true>
+    {
+        static FORCEINLINE float map(const Phanes::Core::Math::TVector4<float, true>& v1)
+        {
+            // Reuse the dot-product helper instead of reading the MSVC-only __m128::m128_f32 member.
+            return sqrt(SIMD::vec4_dot_cvtf32(v1.data, v1.data));
+        }
+    };
+
+    // Dot product of v1 and v2; the work is delegated to SIMD::vec4_dot_cvtf32.
+    template<> struct compute_vec4_dotp<float, true>
+    {
+        using VecT = Phanes::Core::Math::TVector4<float, true>;
+
+        static FORCEINLINE float map(const VecT& v1, const VecT& v2)
+        { return SIMD::vec4_dot_cvtf32(v1.data, v2.data); }
+    };
+
+    // Loads four scalars into the register of v1: x goes to lane 0, w to lane 3.
+    template<> struct compute_vec4_set<float, true>
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4<float, true>& v1, float x, float y, float z, float w)
+        {
+            v1.data = _mm_set_ps(w, z, y, x);   // _mm_set_ps lists the highest lane first
+        }
+    };
+
+    // Lane-wise maximum: r receives _mm_max_ps applied to the registers of v1 and v2.
+    template<> struct compute_vec4_max<float, true>
+    {
+        using VecT = Phanes::Core::Math::TVector4<float, true>;
+
+        static FORCEINLINE void map(VecT& r, const VecT& v1, const VecT& v2)
+        { r.data = _mm_max_ps(v1.data, v2.data); }
+    };
+
+    // Lane-wise minimum: r receives _mm_min_ps applied to the registers of v1 and v2.
+    template<> struct compute_vec4_min<float, true>
+    {
+        using VecT = Phanes::Core::Math::TVector4<float, true>;
+
+        static FORCEINLINE void map(VecT& r, const VecT& v1, const VecT& v2)
+        { r.data = _mm_min_ps(v1.data, v2.data); }
+    };
+
+    // Perspective divide: every lane of v1 is divided by v1.w, then the w lane of the result is cleared.
+    template<> struct compute_vec4_pdiv<float, true>
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector4<float, true>& r, const Phanes::Core::Math::TVector4<float, true>& v1)
+        {
+            __m128 tmp = _mm_div_ps(v1.data, _mm_set_ps1(v1.w));
+            r.data = _mm_blend_ps(tmp, _mm_setzero_ps(), 0x8); // bit 3 = lane 3 (w); mask 0x1 would zero x instead
+        }
+    };
+
 
     // ============ //
     //   TVector3   //
@@ -260,6 +336,14 @@ namespace Phanes::Core::Math::Detail
         }
     };
 
+    // Loads x, y, z into lanes 0..2 of v1 and writes 0.0f into the fourth lane.
+    template<> struct compute_vec3_set<float, true>
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector3<float, true>& v1, float x, float y, float z)
+        {
+            v1.data = _mm_set_ps(0.0f, z, y, x);   // _mm_set_ps lists the highest lane first
+        }
+    };
 
     template<> struct compute_vec3_add<float, true> : public compute_vec4_add<float, true> {};
     template<> struct compute_vec3_sub<float, true> : public compute_vec4_sub<float, true> {};
@@ -267,6 +351,10 @@ namespace Phanes::Core::Math::Detail
     template<> struct compute_vec3_div<float, true> : public compute_vec4_div<float, true> {};
     template<> struct compute_vec3_inc<float, true> : public compute_vec4_inc<float, true> {};
     template<> struct compute_vec3_dec<float, true> : public compute_vec4_dec<float, true> {};
+    template<> struct compute_vec3_mag<float, true> : public compute_vec4_mag<float, true> {}; // reuses the vec4 magnitude; NOTE(review): presumably relies on the fourth lane being zero - confirm
+    template<> struct compute_vec3_dotp<float, true> : public compute_vec4_dotp<float, true> {}; // reuses the vec4 dot product; NOTE(review): same fourth-lane assumption - confirm
+    template<> struct compute_vec3_max<float, true> : public compute_vec4_max<float, true> {}; // reuses the vec4 lane-wise max
+    template<> struct compute_vec3_min<float, true> : public compute_vec4_min<float, true> {}; // reuses the vec4 lane-wise min
 
     template<>
     struct compute_vec3_cross_p<float, true>
@@ -336,6 +424,11 @@ namespace Phanes::Core::Math::Detail
         {
             r.comp = _mm_sub_pd(v1.comp, _mm_set1_pd(s));
         }
+
+        // Scalar-on-the-left overload: s is broadcast to both lanes and v1 is
+        // subtracted from it, so every lane of r holds (s - corresponding lane of v1).
+        static FORCEINLINE void map(Phanes::Core::Math::TVector2<double, true>& r, double s, const Phanes::Core::Math::TVector2<double, true>& v1)
+        { r.comp = _mm_sub_pd(_mm_set1_pd(s), v1.comp); }
     };
 
     template<>
@@ -364,6 +457,11 @@ namespace Phanes::Core::Math::Detail
         {
             r.comp = _mm_div_pd(v1.comp, _mm_set1_pd(s));
         }
+
+        // Scalar-on-the-left overload: s is broadcast to both lanes and divided
+        // by v1, so every lane of r holds (s / corresponding lane of v1).
+        static FORCEINLINE void map(Phanes::Core::Math::TVector2<double, true>& r, double s, const Phanes::Core::Math::TVector2<double, true>& v1)
+        { r.comp = _mm_div_pd(_mm_set1_pd(s), v1.comp); }
     };
 
     template<>
@@ -384,6 +482,58 @@ namespace Phanes::Core::Math::Detail
         }
     };
 
+    // Magnitude of a 2D double vector: sqrt of the sum of both squared lanes.
+    template<>
+    struct compute_vec2_mag<double, true>
+    {
+        static FORCEINLINE double map(const Phanes::Core::Math::TVector2<double, true>& v1)
+        {
+            const __m128d sq = _mm_mul_pd(v1.data, v1.data);
+            return sqrt(_mm_cvtsd_f64(_mm_add_sd(sq, _mm_unpackhi_pd(sq, sq)))); // lane 0 + lane 1, without the MSVC-only m128d_f64 member
+        }
+    };
+
+    // Dot product of two 2D double vectors: sum of the lane-wise products of v1 and v2.
+    template<> struct compute_vec2_dotp<double, true>
+    {
+        static FORCEINLINE double map(const Phanes::Core::Math::TVector2<double, true>& v1, const Phanes::Core::Math::TVector2<double, true>& v2)
+        {
+            const __m128d prod = _mm_mul_pd(v1.data, v2.data);
+            return _mm_cvtsd_f64(_mm_add_sd(prod, _mm_unpackhi_pd(prod, prod))); // lane 0 + lane 1
+        }
+        static FORCEINLINE double map(const Phanes::Core::Math::TVector2<double, true>& v1) { return map(v1, v1); } // previous one-argument form: v1 dotted with itself
+    };
+
+    // Max: r receives _mm_max_pd applied to the registers of v1 and v2.
+    template<> struct compute_vec2_max<double, true>
+    {
+        using VecT = Phanes::Core::Math::TVector2<double, true>;
+        static FORCEINLINE void map(VecT& r, const VecT& v1, const VecT& v2)
+        {
+            r.data = _mm_max_pd(v1.data, v2.data);
+        }
+    };
+
+    // Min: r receives _mm_min_pd applied to the registers of v1 and v2.
+    template<> struct compute_vec2_min<double, true>
+    {
+        using VecT = Phanes::Core::Math::TVector2<double, true>;
+        static FORCEINLINE void map(VecT& r, const VecT& v1, const VecT& v2)
+        {
+            r.data = _mm_min_pd(v1.data, v2.data);
+        }
+    };
+
+    // Set: stores x in the low lane and y in the high lane of v1.
+    template<> struct compute_vec2_set<double, true>
+    {
+        static FORCEINLINE void map(Phanes::Core::Math::TVector2<double, true>& v1, double x, double y)
+        {
+            // _mm_set_pd lists the high lane first, hence the swapped argument order.
+            v1.data = _mm_set_pd(y, x);
+        }
+    };
+
 
     // =============== //
     //   TIntVector4   //